karafka-web 0.7.10 → 0.8.0.rc1

Files changed (163)
  1. checksums.yaml +4 -4
  2. checksums.yaml.gz.sig +0 -0
  3. data/.github/workflows/ci.yml +18 -5
  4. data/.ruby-version +1 -1
  5. data/CHANGELOG.md +63 -0
  6. data/Gemfile.lock +22 -22
  7. data/docker-compose.yml +3 -1
  8. data/karafka-web.gemspec +2 -2
  9. data/lib/karafka/web/config.rb +16 -3
  10. data/lib/karafka/web/contracts/config.rb +7 -2
  11. data/lib/karafka/web/errors.rb +12 -0
  12. data/lib/karafka/web/inflector.rb +33 -0
  13. data/lib/karafka/web/installer.rb +20 -11
  14. data/lib/karafka/web/management/actions/base.rb +36 -0
  15. data/lib/karafka/web/management/actions/clean_boot_file.rb +33 -0
  16. data/lib/karafka/web/management/actions/create_initial_states.rb +77 -0
  17. data/lib/karafka/web/management/actions/create_topics.rb +139 -0
  18. data/lib/karafka/web/management/actions/delete_topics.rb +30 -0
  19. data/lib/karafka/web/management/actions/enable.rb +117 -0
  20. data/lib/karafka/web/management/actions/extend_boot_file.rb +39 -0
  21. data/lib/karafka/web/management/actions/migrate_states_data.rb +18 -0
  22. data/lib/karafka/web/management/migrations/0_base.rb +58 -0
  23. data/lib/karafka/web/management/migrations/0_set_initial_consumers_metrics.rb +36 -0
  24. data/lib/karafka/web/management/migrations/0_set_initial_consumers_state.rb +43 -0
  25. data/lib/karafka/web/management/migrations/1699543515_fill_missing_received_and_sent_bytes_in_consumers_metrics.rb +26 -0
  26. data/lib/karafka/web/management/migrations/1699543515_fill_missing_received_and_sent_bytes_in_consumers_state.rb +23 -0
  27. data/lib/karafka/web/management/migrations/1700234522_introduce_waiting_in_consumers_metrics.rb +24 -0
  28. data/lib/karafka/web/management/migrations/1700234522_introduce_waiting_in_consumers_state.rb +20 -0
  29. data/lib/karafka/web/management/migrations/1700234522_remove_processing_from_consumers_metrics.rb +24 -0
  30. data/lib/karafka/web/management/migrations/1700234522_remove_processing_from_consumers_state.rb +20 -0
  31. data/lib/karafka/web/management/migrations/1704722380_split_listeners_into_active_and_paused_in_metrics.rb +36 -0
  32. data/lib/karafka/web/management/migrations/1704722380_split_listeners_into_active_and_paused_in_states.rb +32 -0
  33. data/lib/karafka/web/management/migrator.rb +117 -0
  34. data/lib/karafka/web/processing/consumer.rb +39 -38
  35. data/lib/karafka/web/processing/consumers/aggregators/metrics.rb +2 -3
  36. data/lib/karafka/web/processing/consumers/aggregators/state.rb +8 -3
  37. data/lib/karafka/web/processing/consumers/contracts/aggregated_stats.rb +5 -1
  38. data/lib/karafka/web/processing/publisher.rb +59 -0
  39. data/lib/karafka/web/tracking/consumers/contracts/job.rb +3 -2
  40. data/lib/karafka/web/tracking/consumers/contracts/partition.rb +1 -0
  41. data/lib/karafka/web/tracking/consumers/contracts/report.rb +6 -1
  42. data/lib/karafka/web/tracking/consumers/contracts/subscription_group.rb +10 -1
  43. data/lib/karafka/web/tracking/consumers/listeners/connections.rb +49 -0
  44. data/lib/karafka/web/tracking/consumers/listeners/pausing.rb +7 -4
  45. data/lib/karafka/web/tracking/consumers/listeners/processing.rb +78 -70
  46. data/lib/karafka/web/tracking/consumers/listeners/statistics.rb +40 -13
  47. data/lib/karafka/web/tracking/consumers/sampler.rb +82 -25
  48. data/lib/karafka/web/tracking/helpers/ttls/array.rb +72 -0
  49. data/lib/karafka/web/tracking/helpers/ttls/hash.rb +34 -0
  50. data/lib/karafka/web/tracking/helpers/ttls/stats.rb +49 -0
  51. data/lib/karafka/web/tracking/helpers/ttls/windows.rb +32 -0
  52. data/lib/karafka/web/tracking/reporter.rb +1 -0
  53. data/lib/karafka/web/ui/app.rb +22 -4
  54. data/lib/karafka/web/ui/base.rb +18 -2
  55. data/lib/karafka/web/ui/controllers/base.rb +34 -4
  56. data/lib/karafka/web/ui/controllers/become_pro.rb +1 -1
  57. data/lib/karafka/web/ui/controllers/cluster.rb +33 -9
  58. data/lib/karafka/web/ui/controllers/consumers.rb +8 -2
  59. data/lib/karafka/web/ui/controllers/dashboard.rb +2 -2
  60. data/lib/karafka/web/ui/controllers/errors.rb +2 -2
  61. data/lib/karafka/web/ui/controllers/jobs.rb +55 -5
  62. data/lib/karafka/web/ui/controllers/requests/params.rb +5 -0
  63. data/lib/karafka/web/ui/controllers/responses/deny.rb +15 -0
  64. data/lib/karafka/web/ui/controllers/responses/file.rb +23 -0
  65. data/lib/karafka/web/ui/controllers/responses/{data.rb → render.rb} +3 -3
  66. data/lib/karafka/web/ui/controllers/routing.rb +11 -2
  67. data/lib/karafka/web/ui/controllers/status.rb +1 -1
  68. data/lib/karafka/web/ui/helpers/application_helper.rb +70 -0
  69. data/lib/karafka/web/ui/lib/hash_proxy.rb +29 -14
  70. data/lib/karafka/web/ui/lib/sorter.rb +170 -0
  71. data/lib/karafka/web/ui/models/counters.rb +6 -0
  72. data/lib/karafka/web/ui/models/health.rb +23 -2
  73. data/lib/karafka/web/ui/models/jobs.rb +48 -0
  74. data/lib/karafka/web/ui/models/metrics/charts/aggregated.rb +33 -0
  75. data/lib/karafka/web/ui/models/metrics/charts/topics.rb +1 -1
  76. data/lib/karafka/web/ui/models/process.rb +2 -1
  77. data/lib/karafka/web/ui/models/status.rb +23 -7
  78. data/lib/karafka/web/ui/models/topic.rb +3 -1
  79. data/lib/karafka/web/ui/models/visibility_filter.rb +16 -0
  80. data/lib/karafka/web/ui/pro/app.rb +44 -6
  81. data/lib/karafka/web/ui/pro/controllers/cluster.rb +1 -0
  82. data/lib/karafka/web/ui/pro/controllers/consumers.rb +52 -6
  83. data/lib/karafka/web/ui/pro/controllers/dashboard.rb +1 -1
  84. data/lib/karafka/web/ui/pro/controllers/dlq.rb +1 -1
  85. data/lib/karafka/web/ui/pro/controllers/errors.rb +3 -3
  86. data/lib/karafka/web/ui/pro/controllers/explorer.rb +8 -8
  87. data/lib/karafka/web/ui/pro/controllers/health.rb +34 -2
  88. data/lib/karafka/web/ui/pro/controllers/jobs.rb +11 -0
  89. data/lib/karafka/web/ui/pro/controllers/messages.rb +42 -0
  90. data/lib/karafka/web/ui/pro/controllers/routing.rb +11 -2
  91. data/lib/karafka/web/ui/pro/views/consumers/_breadcrumbs.erb +8 -2
  92. data/lib/karafka/web/ui/pro/views/consumers/_consumer.erb +14 -8
  93. data/lib/karafka/web/ui/pro/views/consumers/_counters.erb +8 -6
  94. data/lib/karafka/web/ui/pro/views/consumers/consumer/_job.erb +4 -1
  95. data/lib/karafka/web/ui/pro/views/consumers/consumer/_no_jobs.erb +1 -1
  96. data/lib/karafka/web/ui/pro/views/consumers/consumer/_partition.erb +1 -3
  97. data/lib/karafka/web/ui/pro/views/consumers/consumer/_subscription_group.erb +28 -11
  98. data/lib/karafka/web/ui/pro/views/consumers/consumer/_tabs.erb +10 -3
  99. data/lib/karafka/web/ui/pro/views/consumers/index.erb +3 -3
  100. data/lib/karafka/web/ui/pro/views/consumers/pending_jobs.erb +43 -0
  101. data/lib/karafka/web/ui/pro/views/consumers/{jobs.erb → running_jobs.erb} +11 -10
  102. data/lib/karafka/web/ui/pro/views/dashboard/index.erb +7 -1
  103. data/lib/karafka/web/ui/pro/views/explorer/message/_message_actions.erb +18 -0
  104. data/lib/karafka/web/ui/pro/views/explorer/message/_metadata.erb +43 -0
  105. data/lib/karafka/web/ui/pro/views/explorer/message/_payload.erb +21 -0
  106. data/lib/karafka/web/ui/pro/views/explorer/message/_payload_actions.erb +19 -0
  107. data/lib/karafka/web/ui/pro/views/explorer/show.erb +9 -84
  108. data/lib/karafka/web/ui/pro/views/health/_breadcrumbs.erb +8 -0
  109. data/lib/karafka/web/ui/pro/views/health/_partition.erb +1 -3
  110. data/lib/karafka/web/ui/pro/views/health/_partition_offset.erb +4 -4
  111. data/lib/karafka/web/ui/pro/views/health/_partition_times.erb +32 -0
  112. data/lib/karafka/web/ui/pro/views/health/_tabs.erb +9 -0
  113. data/lib/karafka/web/ui/pro/views/health/changes.erb +66 -0
  114. data/lib/karafka/web/ui/pro/views/health/offsets.erb +14 -14
  115. data/lib/karafka/web/ui/pro/views/health/overview.erb +11 -11
  116. data/lib/karafka/web/ui/pro/views/jobs/_job.erb +1 -1
  117. data/lib/karafka/web/ui/pro/views/jobs/_no_jobs.erb +1 -1
  118. data/lib/karafka/web/ui/pro/views/jobs/pending.erb +39 -0
  119. data/lib/karafka/web/ui/pro/views/jobs/running.erb +39 -0
  120. data/lib/karafka/web/ui/pro/views/routing/_consumer_group.erb +2 -2
  121. data/lib/karafka/web/ui/pro/views/routing/_topic.erb +9 -0
  122. data/lib/karafka/web/ui/pro/views/routing/show.erb +12 -0
  123. data/lib/karafka/web/ui/pro/views/shared/_navigation.erb +1 -1
  124. data/lib/karafka/web/ui/public/javascripts/application.js +10 -0
  125. data/lib/karafka/web/ui/public/stylesheets/application.css +4 -0
  126. data/lib/karafka/web/ui/views/cluster/_breadcrumbs.erb +16 -0
  127. data/lib/karafka/web/ui/views/cluster/_tabs.erb +27 -0
  128. data/lib/karafka/web/ui/views/cluster/brokers.erb +27 -0
  129. data/lib/karafka/web/ui/views/cluster/topics.erb +35 -0
  130. data/lib/karafka/web/ui/views/consumers/_counters.erb +8 -6
  131. data/lib/karafka/web/ui/views/consumers/_summary.erb +2 -2
  132. data/lib/karafka/web/ui/views/consumers/index.erb +3 -3
  133. data/lib/karafka/web/ui/views/dashboard/_ranges_selector.erb +23 -7
  134. data/lib/karafka/web/ui/views/dashboard/index.erb +19 -8
  135. data/lib/karafka/web/ui/views/errors/show.erb +2 -23
  136. data/lib/karafka/web/ui/views/jobs/_breadcrumbs.erb +17 -1
  137. data/lib/karafka/web/ui/views/jobs/_job.erb +1 -1
  138. data/lib/karafka/web/ui/views/jobs/_no_jobs.erb +1 -1
  139. data/lib/karafka/web/ui/views/jobs/_tabs.erb +27 -0
  140. data/lib/karafka/web/ui/views/jobs/{index.erb → pending.erb} +9 -7
  141. data/lib/karafka/web/ui/{pro/views/jobs/index.erb → views/jobs/running.erb} +9 -11
  142. data/lib/karafka/web/ui/views/routing/_consumer_group.erb +14 -12
  143. data/lib/karafka/web/ui/views/shared/_navigation.erb +1 -1
  144. data/lib/karafka/web/ui/views/shared/_pagination.erb +1 -1
  145. data/lib/karafka/web/ui/views/shared/exceptions/not_allowed.erb +37 -0
  146. data/lib/karafka/web/ui/views/status/show.erb +17 -2
  147. data/lib/karafka/web/ui/views/status/warnings/_routing_topics_presence.erb +15 -0
  148. data/lib/karafka/web/version.rb +1 -1
  149. data/lib/karafka/web.rb +6 -2
  150. data.tar.gz.sig +0 -0
  151. metadata +61 -26
  152. metadata.gz.sig +0 -0
  153. data/lib/karafka/web/management/base.rb +0 -34
  154. data/lib/karafka/web/management/clean_boot_file.rb +0 -31
  155. data/lib/karafka/web/management/create_initial_states.rb +0 -101
  156. data/lib/karafka/web/management/create_topics.rb +0 -133
  157. data/lib/karafka/web/management/delete_topics.rb +0 -28
  158. data/lib/karafka/web/management/enable.rb +0 -102
  159. data/lib/karafka/web/management/extend_boot_file.rb +0 -37
  160. data/lib/karafka/web/tracking/ttl_array.rb +0 -59
  161. data/lib/karafka/web/tracking/ttl_hash.rb +0 -16
  162. data/lib/karafka/web/ui/pro/views/dashboard/_ranges_selector.erb +0 -39
  163. data/lib/karafka/web/ui/views/cluster/index.erb +0 -74
--- a/data/lib/karafka/web/tracking/consumers/listeners/processing.rb
+++ b/data/lib/karafka/web/tracking/consumers/listeners/processing.rb
@@ -12,10 +12,33 @@ module Karafka
       # @param event [Karafka::Core::Monitoring::Event]
       def on_worker_processed(event)
         track do |sampler|
-          sampler.times[:total] << event[:time]
+          sampler.windows.m1[:processed_total_time] << event[:time]
         end
       end

+      # We do not track idle jobs here because they are internal and not user-facing
+      %i[
+        consume
+        revoked
+        shutdown
+        tick
+      ].each do |action|
+        # Tracks the job that is going to be scheduled so we can also display pending jobs
+        class_eval <<~RUBY, __FILE__, __LINE__ + 1
+          # @param event [Karafka::Core::Monitoring::Event]
+          def on_consumer_before_schedule_#{action}(event)
+            consumer = event.payload[:caller]
+            jid = job_id(consumer, '#{action}')
+            job_details = job_details(consumer, '#{action}')
+            job_details[:status] = 'pending'
+
+            track do |sampler|
+              sampler.jobs[jid] = job_details
+            end
+          end
+        RUBY
+      end
+
       # Counts work execution and processing states in consumer instances
       #
       # @param event [Karafka::Core::Monitoring::Event]
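
For reference, the class_eval template above defines one pending-job handler per action. Substituting action = :consume, the generated method is equivalent to this sketch:

    # @param event [Karafka::Core::Monitoring::Event]
    def on_consumer_before_schedule_consume(event)
      consumer = event.payload[:caller]
      jid = job_id(consumer, 'consume')
      job_details = job_details(consumer, 'consume')
      job_details[:status] = 'pending'

      # Registering the job under its id makes it visible as "pending" in the Web UI
      # until the matching consumed / errored event removes or replaces it
      track do |sampler|
        sampler.jobs[jid] = job_details
      end
    end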
@@ -34,6 +57,19 @@ module Karafka
        end
      end

+     # Collect info about consumption event that occurred and its metrics
+     # Removes the job from running jobs
+     #
+     # @param event [Karafka::Core::Monitoring::Event]
+     def on_consumer_consumed(event)
+       consumer = event.payload[:caller]
+       jid = job_id(consumer, 'consume')
+
+       track do |sampler|
+         sampler.jobs.delete(jid)
+       end
+     end
+
      # Removes failed job from active jobs
      #
      # @param event [Karafka::Core::Monitoring::Event]
@@ -46,6 +82,8 @@ module Karafka
            'revoked'
          when 'consumer.shutdown.error'
            'shutdown'
+         when 'consumer.tick.error'
+           'tick'
          # This is not a user facing execution flow, but internal system one
          # that is why it will not be reported as a separate job for the UI
          when 'consumer.idle.error'
@@ -65,72 +103,39 @@ module Karafka
        end
      end

-     # Collect info about consumption event that occurred and its metrics
-     # Removes the job from running jobs
-     #
-     # @param event [Karafka::Core::Monitoring::Event]
-     def on_consumer_consumed(event)
-       consumer = event.payload[:caller]
-       topic = consumer.topic
-       consumer_group_id = topic.consumer_group.id
-       messages_count = consumer.messages.size
-       time = event[:time]
-       jid = job_id(consumer, 'consume')
-
-       track do |sampler|
-         sampler.jobs.delete(jid)
-         sampler.times[consumer_group_id] << [topic.name, time, messages_count]
-       end
-     end
-
-     # Stores this job details
-     #
-     # @param event [Karafka::Core::Monitoring::Event]
-     def on_consumer_revoke(event)
-       consumer = event.payload[:caller]
-       jid = job_id(consumer, 'revoked')
-       job_details = job_details(consumer, 'revoked')
-
-       track do |sampler|
-         sampler.jobs[jid] = job_details
-       end
-     end
-
-     # Removes the job from running jobs
-     #
-     # @param event [Karafka::Core::Monitoring::Event]
-     def on_consumer_revoked(event)
-       consumer = event.payload[:caller]
-       jid = job_id(consumer, 'revoked')
-
-       track do |sampler|
-         sampler.jobs.delete(jid)
-       end
-     end
-
-     # Stores this job details
-     #
-     # @param event [Karafka::Core::Monitoring::Event]
-     def on_consumer_shutting_down(event)
-       consumer = event.payload[:caller]
-       jid = job_id(consumer, 'shutdown')
-       job_details = job_details(consumer, 'shutdown')
-
-       track do |sampler|
-         sampler.jobs[jid] = job_details
-       end
-     end
-
-     # Removes the job from running jobs
-     #
-     # @param event [Karafka::Core::Monitoring::Event]
-     def on_consumer_shutdown(event)
-       consumer = event.payload[:caller]
-       jid = job_id(consumer, 'shutdown')
-
-       track do |sampler|
-         sampler.jobs.delete(jid)
-       end
+     # Consume has a bit different reporting flow than other jobs because it bumps certain
+     # counters that other jobs do not. This is why it is defined above separately
+     [
+       [:revoke, :revoked, 'revoked'],
+       [:shutting_down, :shutdown, 'shutdown'],
+       [:tick, :ticked, 'tick']
+     ].each do |pre, post, action|
+       class_eval <<~METHOD, __FILE__, __LINE__ + 1
+         # Stores this job details
+         #
+         # @param event [Karafka::Core::Monitoring::Event]
+         def on_consumer_#{pre}(event)
+           consumer = event.payload[:caller]
+           jid = job_id(consumer, '#{action}')
+           job_details = job_details(consumer, '#{action}')
+
+           track do |sampler|
+             sampler.jobs[jid] = job_details
+           end
+         end
+
+         # Removes the job from running jobs
+         #
+         # @param event [Karafka::Core::Monitoring::Event]
+         def on_consumer_#{post}(event)
+           consumer = event.payload[:caller]
+           jid = job_id(consumer, '#{action}')
+
+           track do |sampler|
+             sampler.jobs.delete(jid)
+           end
+         end
+       METHOD
      end

      private
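
Each [pre, post, action] triple above generates a registering and a removing handler pair. For [:tick, :ticked, 'tick'], the heredoc expands to roughly this sketch:

    # Stores this job details so the tick shows up as a running job
    def on_consumer_tick(event)
      consumer = event.payload[:caller]
      jid = job_id(consumer, 'tick')
      job_details = job_details(consumer, 'tick')

      track { |sampler| sampler.jobs[jid] = job_details }
    end

    # Removes the finished tick from the running jobs
    def on_consumer_ticked(event)
      consumer = event.payload[:caller]
      jid = job_id(consumer, 'tick')

      track { |sampler| sampler.jobs.delete(jid) }
    end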
@@ -152,14 +157,16 @@ module Karafka
      # more details.
      def job_details(consumer, type)
        {
-         started_at: float_now,
+         updated_at: float_now,
          topic: consumer.topic.name,
          partition: consumer.partition,
          first_offset: consumer.messages.metadata.first_offset,
          last_offset: consumer.messages.metadata.last_offset,
          processing_lag: consumer.messages.metadata.processing_lag,
          consumption_lag: consumer.messages.metadata.consumption_lag,
-         committed_offset: consumer.coordinator.seek_offset - 1,
+         # Committed offset may be -1 when there is no committed offset. This can happen in
+         # case of ticking that started before any consumption job happened
+         committed_offset: consumer.coordinator.seek_offset.to_i - 1,
          # In theory this is redundant because we have first and last offset, but it is
          # needed because VPs do not have linear count. For VPs first and last offset
          # will be further away than the total messages count for a particular VP
@@ -167,7 +174,8 @@ module Karafka
          consumer: consumer.class.to_s,
          consumer_group: consumer.topic.consumer_group.id,
          type: type,
-         tags: consumer.tags
+         tags: consumer.tags,
+         status: 'running'
        }
      end
    end
--- a/data/lib/karafka/web/tracking/consumers/listeners/statistics.rb
+++ b/data/lib/karafka/web/tracking/consumers/listeners/statistics.rb
@@ -18,6 +18,8 @@ module Karafka
        sg_id = event[:subscription_group_id]
        sg_details = extract_sg_details(sg_id, cgrp)

+       track_transfers(statistics)
+
        # More than one subscription group from the same consumer group may be reporting
        # almost the same time. To prevent corruption of partial data, we put everything here
        # in track as we merge data from multiple subscription groups
@@ -42,25 +44,41 @@ module Karafka
              }

              topic_details[:partitions][pt_id] = metrics.merge(
-               id: pt_id,
+               id: pt_id
+             ).merge(
                # Pauses are stored on a consumer group since we do not process same topic
                # twice in the multiple subscription groups
-               poll_state: poll_state(cg_id, topic_name, pt_id)
+               poll_details(sg_id, topic_name, pt_id)
              )
            end
          end

-         sampler.consumer_groups[cg_id] ||= {
-           id: cg_id,
-           subscription_groups: {}
-         }
-
          sampler.consumer_groups[cg_id][:subscription_groups][sg_id] = sg_details
        end
      end

      private

+     # Tracks network transfers from and to the client using a 1 minute rolling window
+     #
+     # @param statistics [Hash] statistics hash
+     def track_transfers(statistics)
+       brokers = statistics.fetch('brokers', {})
+
+       return if brokers.empty?
+
+       track do |sampler|
+         client_name = statistics.fetch('name')
+
+         brokers.each do |broker_name, values|
+           scope_name = "#{client_name}-#{broker_name}"
+
+           sampler.windows.m1["#{scope_name}-rxbytes"] << values.fetch('rxbytes', 0)
+           sampler.windows.m1["#{scope_name}-txbytes"] << values.fetch('txbytes', 0)
+         end
+       end
+     end
+
      # Extracts basic consumer group related details
      # @param sg_id [String]
      # @param sg_stats [Hash]
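
track_transfers reads only a thin slice of the emitted statistics. A minimal illustration of the consumed shape (field names follow librdkafka's statistics payload; the client and broker names are made up):

    statistics = {
      'name' => 'karafka_app#consumer-1',  # hypothetical client name
      'brokers' => {
        'localhost:9092/1' => {
          'rxbytes' => 123_456, # cumulative bytes received from this broker
          'txbytes' => 7_890    # cumulative bytes sent to this broker
        }
      }
    }
    # Samples land under m1 keys like "karafka_app#consumer-1-localhost:9092/1-rxbytes"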
@@ -75,7 +93,7 @@ module Karafka
            'rebalance_age',
            'rebalance_cnt',
            'rebalance_reason'
-         ),
+         ).transform_keys(&:to_sym),
          topics: {}
        }
      end
@@ -132,14 +150,23 @@ module Karafka
        metrics
      end

-     # @param cg_id [String]
+     # @param sg_id [String] subscription group id
      # @param topic_name [String]
-     # @param pt_id [Integer]
+     # @param pt_id [Integer] partition id
      # @return [String] poll state / is partition paused or not
-     def poll_state(cg_id, topic_name, pt_id)
-       pause_id = [cg_id, topic_name, pt_id].join('-')
+     def poll_details(sg_id, topic_name, pt_id)
+       pause_id = [sg_id, topic_name, pt_id].join('-')
+
+       details = { poll_state: 'active', poll_state_ch: 0 }

-       sampler.pauses.include?(pause_id) ? 'paused' : 'active'
+       pause_details = sampler.pauses[pause_id]
+
+       return details unless pause_details
+
+       {
+         poll_state: 'paused',
+         poll_state_ch: [(pause_details.fetch(:paused_till) - monotonic_now).round, 0].max
+       }
      end
    end
  end
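
poll_state_ch reports how much of the pause is left in milliseconds, clamped at zero once the deadline passes. A quick sketch with assumed numbers:

    # hypothetical: monotonic_now => 10_000, partition paused until 12_500
    pause_details = { paused_till: 12_500 }
    [(pause_details.fetch(:paused_till) - 10_000).round, 0].max # => 2_500 ms remaining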
--- a/data/lib/karafka/web/tracking/consumers/sampler.rb
+++ b/data/lib/karafka/web/tracking/consumers/sampler.rb
@@ -9,18 +9,13 @@ module Karafka
    class Sampler < Tracking::Sampler
      include ::Karafka::Core::Helpers::Time

-     attr_reader :counters, :consumer_groups, :errors, :times, :pauses, :jobs
+     attr_reader :counters, :consumer_groups, :subscription_groups, :errors,
+                 :pauses, :jobs, :windows

      # Current schema version
-     # This can be used in the future for detecting incompatible changes and writing
-     # migrations
-     SCHEMA_VERSION = '1.2.3'
-
-     # 60 seconds window for time tracked window-based metrics
-     TIMES_TTL = 60
-
-     # Times ttl in ms
-     TIMES_TTL_MS = TIMES_TTL * 1_000
+     # This is used for detecting incompatible changes and not using outdated data during
+     # upgrades
+     SCHEMA_VERSION = '1.2.9'

      # Counters that count events occurrences during the given window
      COUNTERS_BASE = {
@@ -36,17 +31,23 @@ module Karafka
        dead: 0
      }.freeze

-     private_constant :TIMES_TTL, :TIMES_TTL_MS, :COUNTERS_BASE
+     private_constant :COUNTERS_BASE

      def initialize
        super

+       @windows = Helpers::Ttls::Windows.new
        @counters = COUNTERS_BASE.dup
-       @times = TtlHash.new(TIMES_TTL_MS)
-       @consumer_groups = {}
+       @consumer_groups = Hash.new do |h, cg_id|
+         h[cg_id] = {
+           id: cg_id,
+           subscription_groups: {}
+         }
+       end
+       @subscription_groups = {}
        @errors = []
        @started_at = float_now
-       @pauses = Set.new
+       @pauses = {}
        @jobs = {}
        @shell = MemoizedShell.new
        @memory_total_usage = 0
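
The Hash.new default block above auto-materializes a consumer group skeleton on first access, which is what allowed removing the ||= fallback from the statistics listener hunk earlier. A sketch of the behavior (the group id is made up):

    consumer_groups = Hash.new do |h, cg_id|
      h[cg_id] = { id: cg_id, subscription_groups: {} }
    end

    consumer_groups['example_app'] # => { id: 'example_app', subscription_groups: {} }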
@@ -81,7 +82,9 @@ module Karafka
            cpus: cpus,
            threads: threads,
            cpu_usage: @cpu_usage,
-           tags: Karafka::Process.tags
+           tags: Karafka::Process.tags,
+           bytes_received: bytes_received,
+           bytes_sent: bytes_sent
          },

          versions: {
@@ -98,7 +101,7 @@ module Karafka
            utilization: utilization
          ).merge(total: @counters),

-         consumer_groups: @consumer_groups,
+         consumer_groups: enriched_consumer_groups,
          jobs: jobs.values
        }
      end
@@ -130,21 +133,28 @@ module Karafka
      # utilized all the time within the given time window. 0% means, nothing is happening
      # most if not all the time.
      def utilization
-       return 0 if times[:total].empty?
+       totals = windows.m1[:processed_total_time]
+
+       return 0 if totals.empty?

-       # Max times ttl
        timefactor = float_now - @started_at
-       timefactor = timefactor > TIMES_TTL ? TIMES_TTL : timefactor
+       timefactor = timefactor > 60 ? 60 : timefactor

        # We divide by 1_000 to convert from milliseconds
        # We multiply by 100 to have it in % scale
-       times[:total].sum / 1_000 / workers / timefactor * 100
+       totals.sum / 1_000 / workers / timefactor * 100
      end

-     # @return [Integer] number of listeners
+     # @return [Hash] number of active and standby listeners
      def listeners
-       # This can be zero before the server starts
-       Karafka::Server.listeners&.count.to_i
+       if Karafka::Server.listeners
+         active = Karafka::Server.listeners.count(&:active?)
+         total = Karafka::Server.listeners.count.to_i
+
+         { active: active, standby: total - active }
+       else
+         { active: 0, standby: 0 }
+       end
      end

      # @return [Integer] memory used by this process in kilobytes
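
The utilization computation above sums job execution times collected in the one minute window, spreads them across workers and the elapsed window, and scales to percent. A worked example with assumed numbers:

    # 2 workers, process up for over a minute, 30_000 ms of summed job time in m1
    30_000 / 1_000 / 2 / 60.0 * 100 # => 25.0, i.e. workers were busy 25% of the window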
@@ -175,9 +185,14 @@ module Karafka
      # @return [Hash] job queue statistics
      def jobs_queue_statistics
        # We return empty stats in case jobs queue is not yet initialized
+       base = Karafka::Server.jobs_queue&.statistics || { busy: 0, enqueued: 0 }
+       stats = base.slice(:busy, :enqueued, :waiting)
+       stats[:waiting] ||= 0
        # busy - represents number of jobs that are being executed currently
-       # enqueued - represents number of jobs that are enqueued to be processed
-       Karafka::Server.jobs_queue&.statistics || { busy: 0, enqueued: 0 }
+       # enqueued - jobs that are in the queue but not being picked up yet
+       # waiting - jobs that are not scheduled on the queue but will be
+       #   be enqueued in case of advanced schedulers
+       stats
      end

      # Total memory used in the OS
@@ -265,6 +280,48 @@ module Karafka
          @memory_threads_ps = false
        end
      end
+
+     # Consumer group details need to be enriched with details about polling that comes from
+     # Karafka level. It is also time based, hence we need to materialize it only at the
+     # moment of message dispatch to have it accurate.
+     def enriched_consumer_groups
+       @consumer_groups.each_value do |cg_details|
+         cg_details.each do
+           cg_details.fetch(:subscription_groups, {}).each do |sg_id, sg_details|
+             # This should be always available, since we subscription group polled at time
+             # is first initialized before we start polling, there should be no case where
+             # we have statistics about a given subscription group but we do not have the
+             # last polling time
+             polled_at = subscription_groups.fetch(sg_id).fetch(:polled_at)
+             sg_details[:state][:poll_age] = monotonic_now - polled_at
+           end
+         end
+       end
+
+       @consumer_groups
+     end
+
+     # @return [Integer] number of bytes received per second out of a one minute time window
+     #   by all the consumers
+     # @note We use one minute window to compensate for cases where metrics would be reported
+     #   or recorded faster or slower. This normalizes data
+     def bytes_received
+       @windows
+         .m1
+         .stats_from { |k, _v| k.end_with?('rxbytes') }
+         .rps
+         .round
+     end
+
+     # @return [Integer] number of bytes sent per second out of a one minute time window by
+     #   all the consumers
+     def bytes_sent
+       @windows
+         .m1
+         .stats_from { |k, _v| k.end_with?('txbytes') }
+         .rps
+         .round
+     end
    end
  end
end
--- /dev/null
+++ b/data/lib/karafka/web/tracking/helpers/ttls/array.rb
@@ -0,0 +1,72 @@
+# frozen_string_literal: true
+
+module Karafka
+  module Web
+    module Tracking
+      module Helpers
+        # Namespace for time sensitive related buffers and operators
+        module Ttls
+          # Array that allows us to store data points that expire over time automatically.
+          class Array
+            include ::Karafka::Core::Helpers::Time
+            include Enumerable
+
+            # @param ttl [Integer] milliseconds ttl
+            def initialize(ttl)
+              @ttl = ttl
+              @accu = []
+            end
+
+            # Iterates over only active elements
+            def each
+              clear
+
+              @accu.each do |sample|
+                yield sample[:value]
+              end
+            end
+
+            # @param value [Object] adds value to the array
+            # @return [Object] added element
+            def <<(value)
+              @accu << { value: value, added_at: monotonic_now }
+
+              clear
+
+              value
+            end
+
+            # @return [Boolean] is the array empty
+            def empty?
+              clear
+              @accu.empty?
+            end
+
+            # Samples that are within our TTL time window with the times
+            #
+            # @return [Hash]
+            def samples
+              clear
+              @accu
+            end
+
+            # @return [::Array] pure array version with only active elements
+            def to_a
+              clear
+              super
+            end
+
+            private
+
+            # Evicts outdated samples
+            def clear
+              @accu.delete_if do |sample|
+                monotonic_now - sample[:added_at] > @ttl
+              end
+            end
+          end
+        end
+      end
+    end
+  end
+end
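
A quick usage sketch of the TTL array (ttl and values are made up; the sleep only forces expiration):

    samples = Karafka::Web::Tracking::Helpers::Ttls::Array.new(1_000) # 1 second ttl
    samples << 10
    samples << 20
    samples.to_a   # => [10, 20]
    sleep(1.1)     # both samples now outlive the ttl
    samples.empty? # => true, expired samples were evicted on access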
--- /dev/null
+++ b/data/lib/karafka/web/tracking/helpers/ttls/hash.rb
@@ -0,0 +1,34 @@
+# frozen_string_literal: true
+
+module Karafka
+  module Web
+    module Tracking
+      module Helpers
+        module Ttls
+          # Hash that accumulates data that has an expiration date (ttl)
+          # Used to keep track of metrics in a window
+          class Hash < Hash
+            # @param ttl [Integer] milliseconds ttl
+            def initialize(ttl)
+              super() { |k, v| k[v] = Ttls::Array.new(ttl) }
+            end
+
+            # Takes a block where we provide a hash select filtering to select keys we are
+            # interested in using for aggregated stats. Once filtered, builds a Stats object out
+            # of the candidates
+            #
+            # @param block [Proc] block for selection of elements for stats
+            # @yieldparam [String] key
+            # @yieldparam [Ttls::Array] samples
+            # @return [Stats]
+            def stats_from(&block)
+              Stats.new(
+                select(&block)
+              )
+            end
+          end
+        end
+      end
+    end
+  end
+end
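
The hash lazily creates one Ttls::Array per key via the default block, and stats_from narrows the tracked series before aggregation. A sketch with made-up keys:

    windowed = Karafka::Web::Tracking::Helpers::Ttls::Hash.new(60 * 1_000)
    windowed['client-1-rxbytes'] << 100 # Ttls::Array created on first access
    windowed['client-1-txbytes'] << 50
    # Build Stats only out of the receive series
    windowed.stats_from { |key, _samples| key.end_with?('rxbytes') }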
--- /dev/null
+++ b/data/lib/karafka/web/tracking/helpers/ttls/stats.rb
@@ -0,0 +1,49 @@
+# frozen_string_literal: true
+
+module Karafka
+  module Web
+    module Tracking
+      module Helpers
+        module Ttls
+          # Object that simplifies computing aggregated statistics out of ttl data
+          # For TTL based operations we may collect samples from multiple consumers/producers etc
+          # but in the end we are interested in the collective result of the whole process.
+          #
+          # For example when we talk about data received from Kafka, we want to materialize total
+          # number of bytes and not bytes per given client connection. This layer simplifies this
+          # by doing necessary aggregations and providing the final results
+          class Stats
+            # @param ttls_hash [Ttls::Hash, Hash] hash with window based samples
+            def initialize(ttls_hash)
+              @data = ttls_hash
+                      .values
+                      .map(&:samples)
+                      .map(&:to_a)
+                      .delete_if { |samples| samples.size < 2 }
+                      .map { |samples| samples.map(&:values) }
+            end
+
+            # Computes the rate out of the samples provided on a per second basis. The samples need
+            # to come from the window aggregations
+            #
+            # @return [Float] per second rate value
+            def rps
+              sub_results = @data.map do |samples|
+                oldest = samples.first
+                newest = samples.last
+
+                value = oldest[0] - newest[0]
+                # Convert to seconds as we want to have it in a 1 sec pace
+                time = (oldest[1] - newest[1]) / 1_000
+
+                value / time.to_f
+              end
+
+              sub_results.flatten.sum
+            end
+          end
+        end
+      end
+    end
+  end
+end
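
Worked example of the rps math for a single cumulative counter series (numbers assumed). Samples are stored oldest first, so both the value delta and the time delta come out negative and cancel:

    samples = [[1_000, 0], [4_000, 2_000]] # [cumulative value, added_at ms]
    oldest = samples.first
    newest = samples.last
    value = oldest[0] - newest[0]          # => -3_000
    time = (oldest[1] - newest[1]) / 1_000 # => -2
    value / time.to_f                      # => 1500.0, i.e. 1_500 bytes per second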
--- /dev/null
+++ b/data/lib/karafka/web/tracking/helpers/ttls/windows.rb
@@ -0,0 +1,32 @@
+# frozen_string_literal: true
+
+module Karafka
+  module Web
+    module Tracking
+      module Helpers
+        module Ttls
+          # Object used to track process metrics in time windows. Those are shared, meaning they do
+          # not refer to particular metric type but allow us to store whatever we want.
+          #
+          # We have following time windows:
+          # - m1 - one minute big
+          # - m5 - five minute big
+          Windows = Struct.new(:m1, :m5) do
+            # @return [Ttls::Windows]
+            def initialize
+              super(
+                Ttls::Hash.new(60 * 1_000),
+                Ttls::Hash.new(5 * 60 * 1_000)
+              )
+            end
+
+            # Clears the TTLs windows
+            def clear
+              values.each(&:clear)
+            end
+          end
+        end
+      end
+    end
+  end
+end
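
Putting the Ttls pieces together, this mirrors how the sampler derives bytes_received in the hunk further above (key names assumed):

    windows = Karafka::Web::Tracking::Helpers::Ttls::Windows.new
    windows.m1['client-1-broker-1-rxbytes'] << 1_000 # cumulative rxbytes sample
    windows.m1['client-1-broker-1-rxbytes'] << 4_000 # next emission ~2s later
    windows
      .m1
      .stats_from { |k, _v| k.end_with?('rxbytes') }
      .rps
      .round # => per second receive rate summed across all matching series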
--- a/data/lib/karafka/web/tracking/reporter.rb
+++ b/data/lib/karafka/web/tracking/reporter.rb
@@ -16,6 +16,7 @@ module Karafka
      def active?
        return false unless ::Karafka::Web.producer
        return false unless ::Karafka::Web.producer.status.active?
+       return false unless ::Karafka::Web.config.tracking.active

        true
      end