RubyGems - karafka-web - Versions diffs - 0.6.3 → 0.7.0 - Mend

karafka-web 0.6.3 → 0.7.0

Files changed (214) hide show

data/lib/karafka/web/ui/models/message.rb CHANGED Viewed

@@ -7,6 +7,8 @@ module Karafka
         # A proxy between `::Karafka::Messages::Message` and web UI
         # We work with the Karafka messages but use this model to wrap the work needed.
         class Message
+          extend Lib::Paginations::Paginators
           class << self
             # Looks for a message from a given topic partition
             #
@@ -30,41 +32,44 @@ module Karafka
               )
             end
-            # Fetches requested page of Kafka messages.
+            # Fetches up to `per_page` Kafka messages starting from the requested `start_offset`
+            # (the oldest offset of the page). If `start_offset` is `-1`, will fetch the most
+            # recent results
             #
             # @param topic_id [String]
             # @param partition_id [Integer]
-            # @param page [Integer]
-            # @return [Array] We return both page data as well as all the details needed to build
+            # @param start_offset [Integer] oldest offset from which we want to get the data
+            # @param watermark_offsets [Ui::Models::WatermarkOffsets] watermark offsets
+            # @return [Array] We return page data as well as all the details needed to build
             #   the pagination details.
-            def page(topic_id, partition_id, page)
-              low_offset, high_offset = Karafka::Admin.read_watermark_offsets(
-                topic_id,
-                partition_id
-              )
+            def offset_page(topic_id, partition_id, start_offset, watermark_offsets)
+              low_offset = watermark_offsets.low
+              high_offset = watermark_offsets.high
-              partitions_count = fetch_partition_count(topic_id)
+              # If we start from offset -1, it means we want first page with the most recent
+              # results. We obtain this page by using the offset based on the high watermark
+              # offset
+              start_offset = high_offset - per_page if start_offset == -1
-              no_data_result = [false, [], false, partitions_count]
+              # No previous pages, no data, and no more offsets
+              no_data_result = [false, [], false]
-              # If there is not even one message, we need to early exit
-              # If low and high watermark offsets are of the same value, it means no data in the
-              # topic is present
+              # If there is no data, we return the no results result
               return no_data_result if low_offset == high_offset
-              # We add plus one because we compute previous offset from which we want to start and
-              # not previous page leading offset
-              start_offset = high_offset - (per_page * page)
               if start_offset <= low_offset
+                # If this page does not contain max per page, compute how many messages we can
+                # fetch before stopping
                 count = per_page - (low_offset - start_offset)
-                previous_page = page < 2 ? false : page - 1
-                next_page = false
+                next_offset = false
                 start_offset = low_offset
               else
-                previous_page = page < 2 ? false : page - 1
-                next_page = page + 1
-                count = per_page
+                next_offset = start_offset - per_page
+                # Do not go below the lowest possible offset
+                next_offset = low_offset if next_offset < low_offset
+                count = high_offset - start_offset
+                # If there would be more messages than we want to get, force max
+                count = per_page if count > per_page
               end
               # This code is a bit tricky. Since topics can be compacted and certain offsets may
@@ -93,17 +98,97 @@ module Karafka
                 next unless messages
+                previous_offset = start_offset + count
                 return [
-                  previous_page,
-                  fill_compacted(messages, context_offset, context_count).reverse,
-                  next_page,
-                  partitions_count
+                  # If there is a potential previous page with more recent data, compute its
+                  # offset
+                  previous_offset >= high_offset ? false : previous_offset,
+                  fill_compacted(messages, partition_id, context_offset, context_count, high_offset).reverse,
+                  next_offset
                 ]
               end
               no_data_result
             end
+            # Fetches the requested page of Kafka messages from the given topic partitions and
+            # merges the per-partition results into one list.
+            #
+            # @param topic_id [String]
+            # @param partitions_ids [Array<Integer>] partitions from which we want to get the
+            #   data. Each of them costs one separate watermark offsets query
+            # @param page [Integer] which page we want to get
+            # @return [Array] merged partitions entries and a flag telling if next page has data
+            def topic_page(topic_id, partitions_ids, page)
+              # This is the bottleneck, for each partition we make one request :(
+              offsets = partitions_ids.map do |partition_id|
+                [partition_id, Models::WatermarkOffsets.find(topic_id, partition_id)]
+              end.to_h
+              # Count number of elements we have in each partition
+              # This assumes linear presence until low. If not, gaps will be filled like we fill
+              # for per partition view
+              counts = offsets.values.map { |offset| offset[:high] - offset[:low] }
+              # Establish initial offsets for the iterator (where to start) per partition
+              # We do not use the negative lookup iterator because we already can compute starting
+              # offsets. This saves a lot of calls to Kafka
+              ranges = Sets.call(counts, page).map do |partition_position, partition_range|
+                partition_id = partitions_ids.to_a[partition_position]
+                watermarks = offsets[partition_id]
+                lowest = watermarks[:high] - partition_range.last - 1
+                # We -1 because high watermark offset is the next incoming offset and not the last
+                # one in the topic partition
+                highest = watermarks[:high] - partition_range.first - 1
+                # This range represents offsets we want to fetch
+                [partition_id, lowest..highest]
+              end.to_h
+              # We start on our topic from the lowest offset for each expected partition
+              iterator = Karafka::Pro::Iterator.new(
+                { topic_id => ranges.transform_values(&:first) }
+              )
+              # Build the aggregated representation of each partition messages, starting with the
+              # assumption that all the offsets were compacted. Each offset is prefilled with a
+              # `[partition, offset]` placeholder and then replaced with a real message when one
+              # is received, so the gaps stay filled out-of-the-box
+              aggregated = Hash.new { |h, k| h[k] = {} }
+              # We initialize the hash so we have a constant ascending order based on the partition
+              # number
+              partitions_ids.each { |i| aggregated[i] }
+              # We prefill all the potential offsets for each partition, so in case they were
+              # compacted, we get a continuous flow
+              ranges.each do |partition, range|
+                partition_aggr = aggregated[partition]
+                range.each { |i| partition_aggr[i] = [partition, i] }
+              end
+              # Iterate over all partitions and collect data
+              iterator.each do |message|
+                range = ranges[message.partition]
+                # Do not fetch more data from a partition for which we got last message from the
+                # expected offsets
+                # When all partitions are stopped, we will stop operations. This drastically
+                # improves performance because we no longer have to poll nils
+                iterator.stop_current_partition if message.offset >= range.last
+                partition = aggregated[message.partition]
+                partition[message.offset] = message
+              end
+              [
+                aggregated.values.map(&:values).map(&:reverse).reduce(:+),
+                !Sets.call(counts, page + 1).empty?
+              ]
+            end
             private
             # @param args [Object] anything required by the admin `#read_topic`
@@ -117,16 +202,6 @@ module Karafka
               raise
             end
-            # @param topic_id [String] id of the topic
-            # @return [Integer] number of partitions this topic has
-            def fetch_partition_count(topic_id)
-              ::Karafka::Admin
-                .cluster_info
-                .topics
-                .find { |topic| topic[:topic_name] == topic_id }
-                .fetch(:partition_count)
-            end
             # @return [Integer] elements per page
             def per_page
               ::Karafka::Web.config.ui.per_page
@@ -136,17 +211,26 @@ module Karafka
             # we need to fill those with  just the missing offset and handle this on the UI.
             #
             # @param messages [Array<Karafka::Messages::Message>] selected messages
+            # @param partition_id [Integer] number of partition for which we fill message gap
             # @param start_offset [Integer] offset of the first message (lowest) that we received
             # @param count [Integer] how many messages we wanted - we need that to fill spots to
             #   have exactly the number that was  requested and not more
+            # @param high_offset [Integer] high watermark offset
             # @return [Array<Karafka::Messages::Message, Integer>] array with gaps filled with the
             #   missing offset
-            def fill_compacted(messages, start_offset, count)
-              Array.new(count) do |index|
+            def fill_compacted(messages, partition_id, start_offset, count, high_offset)
+              filled = Array.new(count) do |index|
                 messages.find do |message|
                   (message.offset - start_offset) == index
-                end || start_offset + index
+                end || [partition_id, start_offset + index]
               end
+              # Remove dummies provisioned over the high offset
+              filled.delete_if do |message|
+                message.is_a?(Array) && message.last >= high_offset
+              end
+              filled
             end
           end
         end

data/lib/karafka/web/ui/models/metrics/aggregated.rb ADDED Viewed

@@ -0,0 +1,196 @@
+# frozen_string_literal: true
+module Karafka
+  module Web
+    module Ui
+      module Models
+        # Namespace for metrics related models
+        module Metrics
+          # Materializes the aggregated data and computes the expected diffs out of the snapshots
+          # We do some pre-processing to make sure we do not have bigger gaps and to compensate
+          # for reporting drift
+          class Aggregated < Lib::HashProxy
+            include ::Karafka::Core::Helpers::Time
+            # If a sample is closer to the previous one than that, it will be rejected
+            MIN_ACCEPTED_DRIFT = 4
+            # If samples are further away than that, we will inject an artificial sample in-between
+            MAX_ACCEPTED_DRIFT = 7
+            # For which keys we should compute the delta in reference to the previous period
+            # Metrics we get from the processes are always absolute, hence we need a reference point
+            # to compute the deltas
+            #
+            # If at least two elements do not exist for given delta range, we keep it empty
+            DELTA_KEYS = %i[
+              batches
+              messages
+              errors
+              retries
+              dead
+            ].freeze
+            private_constant :MIN_ACCEPTED_DRIFT, :MAX_ACCEPTED_DRIFT, :DELTA_KEYS
+            # Builds the Web-UI historicals representation that includes deltas
+            # @note The passed `aggregated` hash is modified in place during the pre-processing
+            # @param aggregated [Hash] aggregated historical metrics (must include `:seconds`)
+            def initialize(aggregated)
+              aggregated
+                .tap { |historicals| reject_drifters(historicals) }
+                .tap { |historicals| fill_gaps(historicals) }
+                .then { |historicals| enrich_with_deltas(historicals) }
+                .tap { |historicals| enrich_with_batch_size(historicals) }
+                .tap { |historicals| enrich_with_process_rss(historicals) }
+                .then { |enriched| super(enriched) }
+            end
+            # @return [Boolean] do we have enough data (more than two `seconds` samples) for charts
+            def sufficient?
+              seconds.size > 2
+            end
+            private
+            # Since our reporting is not ms precise, there are cases where sampling can drift.
+            # If drifting gets us close to one side, for delta metrics it would create sudden
+            # artificial drops in metrics that would not match the reality. We reject drifters like
+            # this as we can compensate for this later.
+            #
+            # This problem only affects our near real-time metrics with seconds precision
+            #
+            # @param historicals [Hash] all historicals for all the ranges
+            def reject_drifters(historicals)
+              initial = nil
+              historicals.fetch(:seconds).delete_if do |sample|
+                unless initial
+                  initial = sample.first
+                  next
+                end
+                # Reject values that are closer than minimum
+                too_close = sample.first - initial < MIN_ACCEPTED_DRIFT
+                initial = sample.first
+                too_close
+              end
+            end
+            # In case of a positive drift, we may have gaps bigger than few seconds in reporting.
+            # This can create a false sense of spikes that do not reflect the reality. We compensate
+            # for it by injecting a midpoint sample with interpolated delta values and the rest as is.
+            #
+            # This problem only affects our near real-time metrics with seconds precision
+            #
+            # @param historicals [Hash] all historicals for all the ranges
+            def fill_gaps(historicals)
+              filled = []
+              previous = nil
+              historicals.fetch(:seconds).each do |sample|
+                unless previous
+                  filled << sample
+                  previous = sample
+                  next
+                end
+                if sample.first - previous.first > MAX_ACCEPTED_DRIFT
+                  base = sample.last.dup
+                  DELTA_KEYS.each do |key|
+                    base[key] = previous.last[key] + (sample.last[key] - previous.last[key]) / 2
+                  end
+                  filled << [previous.first + (sample.first - previous.first) / 2, base]
+                end
+                filled << sample
+                previous = sample
+              end
+              historicals[:seconds] = filled
+            end
+            # Takes the historical hash, iterates over all the samples and enriches them with the
+            # delta computed values. First sample of each range is only a baseline and is skipped
+            #
+            # @param historicals [Hash] all historicals for all the ranges
+            # @return [Hash] historicals with delta based data
+            def enrich_with_deltas(historicals)
+              results = {}
+              historicals.each do |range, time_samples|
+                results[range] = []
+                baseline = nil
+                time_samples.each do |time_sample|
+                  metrics = time_sample[1]
+                  if baseline
+                    deltas = compute_deltas(baseline, metrics)
+                    results[range] << [time_sample[0], metrics.merge(deltas)]
+                  end
+                  baseline = metrics
+                end
+              end
+              results
+            end
+            # Batch size is a ratio between number of messages and number of batches
+            # It is derived out of the data we have so we compute it on the fly
+            # @param historicals [Hash] all historicals for all the ranges
+            def enrich_with_batch_size(historicals)
+              historicals.each_value do |time_samples|
+                time_samples.each do |time_sample|
+                  metrics = time_sample[1]
+                  batches = metrics[:batches]
+                  # We check if not zero just in case something would be off there
+                  # We do not want to divide by zero
+                  metrics[:batch_size] = batches.zero? ? 0 : metrics[:messages] / batches
+                end
+              end
+            end
+            # Adds an average RSS on a per process basis (zero when no processes are reported)
+            # @param historicals [Hash] all historicals for all the ranges
+            def enrich_with_process_rss(historicals)
+              historicals.each_value do |time_samples|
+                time_samples.each do |time_sample|
+                  metrics = time_sample[1]
+                  rss = metrics[:rss]
+                  processes = metrics[:processes]
+                  metrics[:process_rss] = processes.zero? ? 0 : rss / processes
+                end
+              end
+            end
+            # Computes deltas for all the relevant keys for which we want to have deltas
+            #
+            # @param previous [Hash] metrics of the earlier sample (needs all the delta keys)
+            # @param current [Hash] metrics of the later sample (needs all the delta keys)
+            # @return [Hash] delta computed values
+            def compute_deltas(previous, current)
+              DELTA_KEYS.map do |delta_key|
+                [
+                  delta_key,
+                  current.fetch(delta_key) - previous.fetch(delta_key)
+                ]
+              end.to_h
+            end
+          end
+        end
+      end
+    end
+  end
+end

data/lib/karafka/web/ui/models/metrics/charts/aggregated.rb ADDED Viewed

@@ -0,0 +1,50 @@
+# frozen_string_literal: true
+module Karafka
+  module Web
+    module Ui
+      module Models
+        module Metrics
+          # Namespace for models related to presentation of charts
+          module Charts
+            # Model for formatting aggregated metrics data of a single period for charts
+            class Aggregated < Lib::HashProxy
+              # @param aggregated [Hash] aggregated metrics for all the periods
+              # @param period [Symbol] period that we are interested in
+              def initialize(aggregated, period)
+                @data = aggregated.to_h.fetch(period)
+              end
+              # @param args [Array<String>] names of aggregated metrics we want to show
+              # @return [String] JSON with data about all the charts we were interested in
+              def with(*args)
+                args
+                  .map { |name| [name.to_sym, public_send(name)] }
+                  .to_h
+                  .to_json
+              end
+              # @param method_name [String, Symbol] checked against keys of the last sample metrics
+              # @param include_private [Boolean]
+              def respond_to_missing?(method_name, include_private = false)
+                @data.last.last.key?(method_name.to_sym) || super
+              end
+              # Handles delegation to fetch appropriate historical metrics based on their name
+              # Returns an array of `[sample.first, value]` pairs when the last sample has the metric
+              # @param method_name [String]
+              # @param arguments [Array] missing method call arguments
+              def method_missing(method_name, *arguments)
+                if @data.last.last.key?(method_name.to_sym)
+                  @data.map { |a| [a.first, a.last[method_name]] }
+                else
+                  super
+                end
+              end
+            end
+          end
+        end
+      end
+    end
+  end
+end

data/lib/karafka/web/ui/models/metrics/charts/topics.rb ADDED Viewed

@@ -0,0 +1,109 @@
+# frozen_string_literal: true
+module Karafka
+  module Web
+    module Ui
+      module Models
+        module Metrics
+          module Charts
+            # Model for preparing chart data about topics states for a single period
+            class Topics
+              # @param topics_data [Hash] topics aggregated metrics data
+              # @param period [Symbol] period that we are interested in
+              def initialize(topics_data, period)
+                @data = topics_data.to_h.fetch(period)
+              end
+              # @return [String] JSON with lags of each of the topics + total lag of all the topics
+              #   from all the consumer groups.
+              def lags_stored
+                total = Hash.new { |h, v| h[v] = 0 }
+                @data.to_h.each_value do |metrics|
+                  metrics.each do |metric|
+                    time = metric.first
+                    lag_stored = metric.last[:lag_stored]
+                    if lag_stored
+                      total[time] ||= 0
+                      total[time] += lag_stored
+                    else
+                      next if total.key?(time)
+                      total[time] = nil
+                    end
+                  end
+                end
+                # Extract only the stored lag from all the data
+                per_topic = @data.to_h.map do |topic, metrics|
+                  extracted = metrics.map { |metric| [metric.first, metric.last[:lag_stored]] }
+                  [topic, extracted]
+                end.to_h
+                # We name it with a space because someone may have a topic called "total" and we
+                # want to avoid collisions
+                per_topic.merge('total sum' => total.to_a).to_json
+              end
+              # @return [String] JSON with producers pace that represents high-watermarks sum for
+              #   each topic
+              def topics_pace
+                topics = {}
+                @data.to_h.each do |topic, metrics|
+                  topic_without_cg = topic.split('[').first
+                  # If we've already seen this topic data, we can skip
+                  next if topics.include?(topic_without_cg)
+                  topics[topic_without_cg] = metrics.map do |current|
+                    [current.first, current.last[:pace]]
+                  end
+                end
+                topics.each_value(&:compact!)
+                topics.to_json
+              end
+              # @return [String] JSON with per-topic, highest LSO freeze duration. Useful for
+              #   debugging of issues arising from hanging transactions
+              def max_lso_time
+                topics = Hash.new { |h, k| h[k] = Hash.new { |h2, k2| h2[k2] = [] } }
+                @data.to_h.each do |topic, metrics|
+                  topic_without_cg = topic.split('[').first
+                  metrics.each do |current|
+                    ls_offset = current.last[:ls_offset] || 0
+                    ls_offset_fd = current.last[:ls_offset_fd] || 0
+                    hi_offset = current.last[:hi_offset] || 0
+                    # We convert this to seconds from milliseconds due to our Web UI precision
+                    # Reporting is in ms for consistency
+                    normalized_fd = (ls_offset_fd / 1_000).round
+                    # In case ls_offset and hi_offset are the same, it means we've reached eof
+                    # and we just don't have more data. In cases like this, LSO freeze duration
+                    # will grow because LSO will remain unchanged, but it does not mean it is
+                    # frozen. It means there is just no more data in the topic partition
+                    # This means we need to nullify this case, otherwise it would report that
+                    # LSO is hanging.
+                    normalized_fd = 0 if ls_offset == hi_offset
+                    topics[topic_without_cg][current.first] << normalized_fd
+                  end
+                end
+                topics.each_value(&:compact!)
+                topics.each_value { |metrics| metrics.transform_values!(&:max) }
+                topics.transform_values! { |values| values.to_a.sort_by!(&:first) }
+                topics.to_json
+              end
+            end
+          end
+        end
+      end
+    end
+  end
+end

data/lib/karafka/web/ui/models/metrics/topics.rb ADDED Viewed

@@ -0,0 +1,101 @@
+# frozen_string_literal: true
+module Karafka
+  module Web
+    module Ui
+      module Models
+        module Metrics
+          # Representation of topics historical metrics based on the aggregated metrics data
+          # We do some pre-processing to align and normalize all the data
+          class Topics < Lib::HashProxy
+            # @param consumers_groups [Hash] historical metrics for consumers groups
+            def initialize(consumers_groups)
+              aggregate_topics_data(consumers_groups)
+                .tap { |topics_metrics| nulify_gaps(topics_metrics) }
+                .then { |topics_metrics| super(topics_metrics) }
+            end
+            private
+            # Extracts and aggregates data on a per-topic basis in a hash. Because in theory same
+            # topic can be consumed by multiple consumer groups, we include consumer group in the
+            # hash keys (`topic_name[cg_name]`).
+            #
+            # @param consumers_groups [Hash] consumers groups initial hash with metrics
+            # @return [Hash] remapped hash with range including extracted topics details
+            def aggregate_topics_data(consumers_groups)
+              extracted = Hash.new { |h, k| h[k] = [] }
+              consumers_groups.each do |range, samples|
+                range_extracted = {}
+                samples.each do |sample|
+                  time = sample.first
+                  groups = sample.last
+                  groups.each do |cg_name, topics|
+                    topics.each do |topic_name, topic_data|
+                      range_extracted["#{topic_name}[#{cg_name}]"] ||= []
+                      range_extracted["#{topic_name}[#{cg_name}]"] << [time, topic_data]
+                    end
+                  end
+                end
+                # Always align the order of topics in hash based on their name so it is
+                # independent from the reported order
+                extracted[range] = range_extracted.keys.sort.map do |key|
+                  [key, range_extracted[key]]
+                end.to_h
+              end
+              extracted
+            end
+            # Fills gaps within data with metrics that have nil values. This is needed for us to be
+            # able to provide consistent charts even with gaps in reporting.
+            #
+            # @param topics_metrics [Hash] flattened topics data
+            # @note This modifies the original data in place
+            # @note We nullify both gaps in metrics as well as gaps in times (no values for time)
+            def nulify_gaps(topics_metrics)
+              # Hash with all potential keys that a single sample metric can have
+              # This allows us to fill gaps not only in times but also in values
+              base_samples = topics_metrics
+                             .values
+                             .map(&:values)
+                             .flatten
+                             .select { |val| val.is_a?(Hash) }
+                             .flat_map(&:keys)
+                             .uniq
+                             .map { |key| [key, nil] }
+                             .to_h
+                             .freeze
+              # Normalize data between topics reportings
+              # One topic may have a sample in a time moment when a different one does not
+              topics_metrics.each_value do |samples|
+                # All available times from all the topics within this range
+                times = samples.values.map { |set| set.map(&:first) }.flatten.uniq
+                samples.each_value do |set|
+                  times.each do |time|
+                    existing_index = set.find_index { |existing_time, _| existing_time == time }
+                    if existing_index
+                      existing_value = set[existing_index][1]
+                      set[existing_index][1] = base_samples.merge(existing_value)
+                    else
+                      set << [time, base_samples]
+                    end
+                  end
+                  set.sort_by!(&:first)
+                end
+              end
+            end
+          end
+        end
+      end
+    end
+  end
+end