RubyGems - elasticgraph-support - Versions diffs - 0.18.0.0 - Mend

elasticgraph-support 0.18.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

checksums.yaml +7 -0
data/LICENSE.txt +21 -0
data/README.md +6 -0
data/elasticgraph-support.gemspec +16 -0
data/lib/elastic_graph/constants.rb +220 -0
data/lib/elastic_graph/error.rb +99 -0
data/lib/elastic_graph/support/faraday_middleware/msearch_using_get_instead_of_post.rb +31 -0
data/lib/elastic_graph/support/faraday_middleware/support_timeouts.rb +36 -0
data/lib/elastic_graph/support/from_yaml_file.rb +53 -0
data/lib/elastic_graph/support/graphql_formatter.rb +66 -0
data/lib/elastic_graph/support/hash_util.rb +191 -0
data/lib/elastic_graph/support/logger.rb +82 -0
data/lib/elastic_graph/support/memoizable_data.rb +147 -0
data/lib/elastic_graph/support/monotonic_clock.rb +20 -0
data/lib/elastic_graph/support/threading.rb +42 -0
data/lib/elastic_graph/support/time_set.rb +293 -0
data/lib/elastic_graph/support/time_util.rb +108 -0
data/lib/elastic_graph/support/untyped_encoder.rb +67 -0
data/lib/elastic_graph/version.rb +15 -0
metadata +256 -0

data/lib/elastic_graph/support/time_set.rb ADDED Viewed

@@ -0,0 +1,293 @@
+# Copyright 2024 Block, Inc.
+#
+# Use of this source code is governed by an MIT-style
+# license that can be found in the LICENSE file or at
+# https://opensource.org/licenses/MIT.
+#
+# frozen_string_literal: true
+module ElasticGraph
+  module Support
+    # Models a set of `::Time` objects, but does so using one or more `::Range` objects.
+    # This is done so that we can support unbounded sets (such as "all times after midnight
+    # on date X").
+    #
+    # Internally, this is a simple wrapper around a set of `::Range` objects. Those ranges take
+    # a few different forms:
+    #
+    # - ALL: a range with no bounds, which implicitly contains all `::Time`s. (It's like the
+    #   integer set from negative to positive infinity).
+    # - An open range: a range with only an upper or lower bound (but not the other).
+    # - A closed range: a range with an upper and lower bound.
+    # - An empty range: a range that contains no `::Time`s, by virtue of its bounds having no overlap.
+    class TimeSet < ::Data.define(:ranges)
+      # Factory method to construct a `TimeSet` using a range with the given bounds.
+      def self.of_range(gt: nil, gte: nil, lt: nil, lte: nil)
+        if gt && gte
+          raise ArgumentError, "TimeSet got two lower bounds, but can have only one (gt: #{gt.inspect}, gte: #{gte.inspect})"
+        end
+        if lt && lte
+          raise ArgumentError, "TimeSet got two upper bounds, but can have only one (lt: #{lt.inspect}, lte: #{lte.inspect})"
+        end
+        # To be able to leverage Ruby's Range class, we need to convert to the "inclusive" ("or equal")
+        # form. This cuts down on the number of test cases we need to write; also, Ruby's Range lets
+        # you control whether the end of a range is inclusive or exclusive, but doesn't let you control
+        # the beginning of the range.
+        #
+        # This is safe to do because our datastores only work with `::Time`s at millisecond granularity,
+        # so `> t` is equivalent to `>= (t + 1ms)` and `< t` is equivalent to `<= (t - 1ms)`.
+        lower_bound = gt&.+(CONSECUTIVE_TIME_INCREMENT) || gte
+        upper_bound = lt&.-(CONSECUTIVE_TIME_INCREMENT) || lte
+        of_range_objects(_ = [RangeFactory.build_non_empty(lower_bound, upper_bound)].compact)
+      end
+      # Factory method to construct a `TimeSet` from a collection of `::Time` objects.
+      # Internally we convert it to a set of `::Range` objects, one per unique time.
+      def self.of_times(times)
+        of_range_objects(times.map { |t| ::Range.new(t, t) })
+      end
+      # Factory method to construct a `TimeSet` from a previously built collection of
+      # ::Time ranges. Mostly used internally by `TimeSet` and in tests.
+      def self.of_range_objects(range_objects)
+        # Use our singleton EMPTY or ALL instances if we can to save on memory.
+        return EMPTY if range_objects.empty?
+        first_range = _ = range_objects.first
+        return ALL if first_range.begin.nil? && first_range.end.nil?
+        new(range_objects)
+      end
+      # Returns a new `TimeSet` containing `::Time`s common to this set and `other_set`.
+      def intersection(other_set)
+        # Here we rely on the distributive and commutative properties of set algebra:
+        #
+        # https://en.wikipedia.org/wiki/Algebra_of_sets
+        # A ∩ (B ∪ C) = (A ∩ B) ∪ (A ∩ C) (distributive property)
+        #       A ∩ B = B ∩ A             (commutative property)
+        #
+        # We can combine these properties to see how the intersection of sets of ranges would work:
+        #          (A₁ ∪ A₂)        ∩        (B₁ ∪ B₂)
+        # =        ((A₁ ∪ A₂) ∩ B₁) ∪ ((A₁ ∪ A₂) ∩ B₂)        (expanding based on distributive property)
+        # =        (B₁ ∩ (A₁ ∪ A₂)) ∪ (B₂ ∩ (A₁ ∪ A₂))        (rearranging based on commutative property)
+        # = ((B₁ ∩ A₁) ∪ (B₁ ∩ A₂)) ∪ ((B₂ ∩ A₁) ∪ (B₂ ∩ A₂)) (expanding based on distributive property)
+        # =  (B₁ ∩ A₁) ∪ (B₁ ∩ A₂)  ∪  (B₂ ∩ A₁) ∪ (B₂ ∩ A₂)  (removing excess parens)
+        # = union of (intersection of each pair)
+        intersected_ranges = ranges.to_a.product(other_set.ranges.to_a)
+          .filter_map { |r1, r2| intersect_ranges(r1, r2) }
+        TimeSet.of_range_objects(intersected_ranges)
+      end
+      # Returns a new `TimeSet` containing `::Time`s that are in either this set or `other_set`.
+      def union(other_set)
+        TimeSet.of_range_objects(ranges.union(other_set.ranges))
+      end
+      # Returns true if the given `::Time` is a member of this `TimeSet`.
+      def member?(time)
+        ranges.any? { |r| r.cover?(time) }
+      end
+      # Returns true if this `TimeSet` and the given one have at least one time in common.
+      def intersect?(other_set)
+        other_set.ranges.any? do |r1|
+          ranges.any? do |r2|
+            ranges_intersect?(r1, r2)
+          end
+        end
+      end
+      # Returns true if this TimeSet contains no members.
+      def empty?
+        ranges.empty?
+      end
+      # Returns a new `TimeSet` containing the difference between this `TimeSet` and the given one.
+      def -(other)
+        new_ranges = other.ranges.to_a.reduce(ranges.to_a) do |accum, other_range|
+          accum.flat_map do |self_range|
+            if ranges_intersect?(self_range, other_range)
+              # Since the ranges intersect, `self_range` must be reduced somehow. Depending on what kind of
+              # intersection we have (e.g. exact equality, `self_range` fully inside `other_range`, `other_range`
+              # fully inside `self_range`, partial overlap where `self_range` begins before `other_range`, or partial
+              # overlap where `self_range` ends after `other_range`), we may have a part of `self_range` that comes
+              # before `other_range`, a part of `self_range` that comes after `other_range`, both, or neither. Below
+              # we build the before and after parts as candidates, but then ignore any resulting ranges that are
+              # invalid, which leaves us with the correct result, without having to explicitly handle each possible case.
+              # @type var candidates: ::Array[timeRange]
+              candidates = []
+              if (other_range_begin = other_range.begin)
+                # This represents the parts of `self_range` that come _before_ `other_range`.
+                candidates << Range.new(self_range.begin, other_range_begin - CONSECUTIVE_TIME_INCREMENT)
+              end
+              if (other_range_end = other_range.end)
+                # This represents the parts of `self_range` that come _after_ `other_range`.
+                candidates << Range.new(other_range_end + CONSECUTIVE_TIME_INCREMENT, self_range.end)
+              end
+              # While some of the ranges produced above may be invalid (due to being descending), we don't have to
+              # filter them out here because `#initialize` takes care of it.
+              candidates
+            else
+              # Since the ranges don't intersect, there is nothing to remove from `self_range`; just return it unmodified.
+              [self_range]
+            end
+          end
+        end
+        TimeSet.of_range_objects(new_ranges)
+      end
+      def negate # Returns the complement of this `TimeSet`: everything in `ALL` that is not a member of this set.
+        ALL - self
+      end
+      private
+      private_class_method :new # use `of_range`, `of_times`, or `of_range_objects` instead.
+      # To ensure immutability, we override this to freeze the set. For convenience, we allow the `ranges`
+      # arg to be an array, and convert to a set here. In addition, we take care of normalizing to the most
+      # optimal form by merging overlapping ranges here, and ignore descending ranges.
+      def initialize(ranges:)
+        normalized_ranges = ranges
+          .reject { |r| descending_range?(r) }
+          .to_set
+          .then { |rs| merge_overlapping_or_adjacent_ranges(rs) }
+          .freeze
+        super(ranges: normalized_ranges)
+      end
+      # Returns true if at least one ::Time exists in both ranges.
+      def ranges_intersect?(r1, r2)
+        r1.cover?(r2.begin) || r1.cover?(r2.end) || r2.cover?(r1.begin) || r2.cover?(r1.end)
+      end
+      # The amount to add to a time to get the next consecutive time, based
+      # on the level of granularity we support. According to the Elasticsearch docs[1],
+      # it only supports millisecond granularity, so that's all we support:
+      #
+      # > Internally, dates are converted to UTC (if the time-zone is specified) and
+      # > stored as a long number representing milliseconds-since-the-epoch.
+      #
+      # We want exact precision here, so we are avoiding using a float for this, preferring
+      # to use a rational instead.
+      #
+      # [1] https://www.elastic.co/guide/en/elasticsearch/reference/7.15/date.html
+      CONSECUTIVE_TIME_INCREMENT = Rational(1, 1000)
+      # Returns true if the given ranges are adjacent with no room for any ::Time
+      # objects to exist between the ranges given the millisecond granularity we operate at.
+      def adjacent?(r1, r2)
+        r1.end&.+(CONSECUTIVE_TIME_INCREMENT)&.==(r2.begin) || r2.end&.+(CONSECUTIVE_TIME_INCREMENT)&.==(r1.begin) || false
+      end
+      # Combines the given ranges into a new range that only contains the common subset of ::Time objects.
+      # Returns `nil` if there is no intersection.
+      def intersect_ranges(r1, r2)
+        RangeFactory.build_non_empty(
+          [r1.begin, r2.begin].compact.max,
+          [r1.end, r2.end].compact.min
+        )
+      end
+      # Helper method that attempts to merge the given set of ranges into an equivalent
+      # set that contains fewer ranges in it but covers the same set of ::Time objects.
+      # As an example, consider these two ranges:
+      #
+      # - 2020-05-01 to 2020-07-01
+      # - 2020-06-01 to 2020-08-01
+      #
+      # These two ranges can safely be merged into a single range of 2020-05-01 to 2020-08-01.
+      # Technically speaking, this is not required; we can just return a TimeSet containing
+      # multiple ranges. However, the goal of a TimeSet is to represent a set of Time objects
+      # as minimally as possible, and to that end it is useful to merge ranges when possible.
+      # While it adds a bit of complexity to merge ranges like this, it'll simplify future
+      # calculations involving a TimeSet.
+      def merge_overlapping_or_adjacent_ranges(all_ranges)
+        # We sometimes have to apply this merge algorithm multiple times in order to fully merge
+        # the ranges into their minimal form. For example, consider these three ranges:
+        #
+        # - 2020-05-01 to 2020-07-01
+        # - 2020-06-01 to 2020-09-01
+        # - 2020-08-01 to 2020-10-01
+        #
+        # Ultimately, we can merge these into a single range of 2020-05-01 to 2020-10-01, but
+        # our algorithm isn't able to do that in a single pass. On the first pass it'll produce
+        # two merged ranges (2020-05-01 to 2020-09-01 and 2020-06-01 to 2020-10-01); after we
+        # apply the algorithm again it is then able to produce the final merged range.
+        # Since we can't predict how many iterations it'll take, we loop here, and break as
+        # soon as there is no more progress to be made.
+        #
+        # While we can't predict how many iterations it'll take, we can put an upper bound on it:
+        # it should take no more than `all_ranges.size` iterations, because every iteration should shrink
+        # `all_ranges` by at least one element--if not, that iteration didn't make any progress
+        # (and we're done anyway).
+        all_ranges.size.times do
+          # Given our set of ranges, any range is potentially mergeable with any other range.
+          # Here we determine which pairs of ranges are mergeable.
+          mergeable_range_pairs = all_ranges.to_a.combination(2).select do |r1, r2|
+            ranges_intersect?(r1, r2) || adjacent?(r1, r2)
+          end
+          # If there are no mergeable pairs, we're done!
+          return all_ranges if mergeable_range_pairs.empty?
+          # For each pair of mergeable ranges, build a merged range.
+          merged_ranges = mergeable_range_pairs.filter_map do |r1, r2|
+            RangeFactory.build_non_empty(
+              nil_or(:min, from: [r1.begin, r2.begin]),
+              nil_or(:max, from: [r1.end, r2.end])
+            )
+          end
+          # Update `all_ranges` based on the merges performed so far.
+          unmergeable_ranges = all_ranges - mergeable_range_pairs.flatten
+          all_ranges = unmergeable_ranges.union(_ = merged_ranges)
+        end
+        all_ranges
+      end
+      # Helper method for `merge_overlapping_or_adjacent_ranges` used to return the most "lenient" range boundary value.
+      # `nil` is used for a beginless or endless range, so we return that if available; otherwise
+      # we apply `min_or_max`.
+      def nil_or(min_or_max, from:)
+        return nil if from.include?(nil)
+        from.public_send(min_or_max)
+      end
+      def descending_range?(range)
+        # If either edge is `nil` it cannot be descending.
+        return false if (range_begin = range.begin).nil?
+        return false if (range_end = range.end).nil?
+        # Otherwise we just compare the edges to determine if it's descending.
+        range_begin > range_end
+      end
+      # An instance in which all `::Time`s fit.
+      ALL = new([::Range.new(nil, nil)])
+      # Singleton instance that's empty.
+      EMPTY = new([])
+      module RangeFactory
+        # Helper method for building a range from the given bounds. Returns either
+        # a built range, or, if the given bounds produce an empty range, returns nil.
+        def self.build_non_empty(lower_bound, upper_bound)
+          if lower_bound.nil? || upper_bound.nil? || lower_bound <= upper_bound
+            ::Range.new(lower_bound, upper_bound)
+          end
+        end
+      end
+    end
+  end
+end

data/lib/elastic_graph/support/time_util.rb ADDED Viewed

@@ -0,0 +1,108 @@
+# Copyright 2024 Block, Inc.
+#
+# Use of this source code is governed by an MIT-style
+# license that can be found in the LICENSE file or at
+# https://opensource.org/licenses/MIT.
+#
+# frozen_string_literal: true
+module ElasticGraph
+  module Support
+    module TimeUtil
+      NANOS_PER_SECOND = 1_000_000_000
+      NANOS_PER_MINUTE = NANOS_PER_SECOND * 60
+      NANOS_PER_HOUR = NANOS_PER_MINUTE * 60
+      # Simple helper function to convert a local time string (such as `03:45:12` or `12:30:43.756`)
+      # to an integer value between 0 and 24 * 60 * 60 * 1,000,000,000 - 1 representing the nano of day
+      # for the local time value.
+      #
+      # This is meant to match the behavior of Java's `LocalTime#toNanoOfDay()` API:
+      # https://docs.oracle.com/en/java/javase/17/docs/api/java.base/java/time/LocalTime.html#toNanoOfDay()
+      #
+      # This is specifically useful when we need to work with local time values in a script: by converting
+      # a local time parameter to nano-of-day, our script can more efficiently compare values, avoiding the
+      # need to parse the same local time parameters over and over again as it applies the script to each
+      # document.
+      #
+      # Note: this method assumes the given `local_time_string` is well-formed. You'll get an exception if
+      # you provide a malformed value, but no effort has been put into giving a clear error message. The
+      # caller is expected to have already validated that the `local_time_string` is formatted correctly.
+      def self.nano_of_day_from_local_time(local_time_string)
+        hours_str, minutes_str, full_seconds_str = local_time_string.split(":")
+        seconds_str, subseconds_str = (_ = full_seconds_str).split(".")
+        hours = Integer(_ = hours_str, 10)
+        minutes = Integer(_ = minutes_str, 10)
+        seconds = Integer(seconds_str, 10)
+        nanos = Integer(subseconds_str.to_s.ljust(9, "0"), 10)
+        (hours * NANOS_PER_HOUR) + (minutes * NANOS_PER_MINUTE) + (seconds * NANOS_PER_SECOND) + nanos
+      end
+      # Helper method for advancing time. Unfortunately, Ruby's core `Time` type does not directly support this.
+      # ActiveSupport (from Rails) provides this functionality, but we don't depend on Rails at all and don't
+      # want to add such a heavyweight dependency for such a small thing.
+      #
+      # Luckily, our needs are quite limited, which makes this a much simpler problem than a general-purpose `time.advance(...)` API:
+      #
+      # - We only need to support year, month, day, and hour advances.
+      # - We only ever need to advance a single unit.
+      #
+      # This provides a simple, correct implementation for that constrained problem space.
+      def self.advance_one_unit(time, unit)
+        case unit # NOTE: there is no `else` branch, so an unrecognized `unit` yields `nil`.
+        when :year
+          with_updated(time, year: time.year + 1) # NOTE(review): no overflow correction here (unlike `:month`), so Feb 29 presumably spills over to Mar 1 -- confirm that is acceptable.
+        when :month
+          maybe_next_month =
+            if time.month == 12
+              with_updated(time, year: time.year + 1, month: 1)
+            else
+              with_updated(time, month: time.month + 1)
+            end
+          # If the next month has fewer days than the month of `time`, then it can "spill over" to a day
+          # from the first week of the month following that. For example, if the date of `time` was 2021-01-31
+          # and we add a month, it attempts to go to `2021-02-31` but such a date doesn't exist--instead
+          # `maybe_next_month` will be on `2021-03-03` because of the overflow. Here we correct for that.
+          #
+          # Our assumption (which we believe to be correct) is that every time this happens, both of these are true:
+          # - `time.day` is near the end of its month
+          # - `maybe_next_month.day` is near the start of its month
+          #
+          # ...and furthermore, we do not believe there is any other case where `time.day` and `maybe_next_month.day` can differ.
+          if time.day > maybe_next_month.day
+            corrected_date = maybe_next_month.to_date - maybe_next_month.day # NOTE(review): `Time#to_date` comes from the `date` stdlib, which this file never requires -- presumably loaded elsewhere; confirm.
+            with_updated(time, year: corrected_date.year, month: corrected_date.month, day: corrected_date.day)
+          else
+            maybe_next_month
+          end
+        when :day
+          next_day = time.to_date + 1
+          with_updated(time, year: next_day.year, month: next_day.month, day: next_day.day)
+        when :hour
+          time + 3600
+        end
+      end
+      private_class_method def self.with_updated(time, year: time.year, month: time.month, day: time.day)
+        # UTC needs to be treated specially here due to an oddity of Ruby's Time class:
+        #
+        # > Time.utc(2021, 12, 2, 12, 30, 30).iso8601
+        #  => "2021-12-02T12:30:30Z"
+        # > Time.new(2021, 12, 2, 12, 30, 30, 0).iso8601
+        #  => "2021-12-02T12:30:30+00:00"
+        #
+        # We want to preserve the `Z` suffix on the ISO8601 representation of the advanced time
+        # (if it was there on the original time), so we use the `::Time.utc` method here to do that.
+        # Non-UTC time must use `::Time.new(...)` with a UTC offset, though.
+        if time.utc?
+          ::Time.utc(year, month, day, time.hour, time.min, time.sec.to_r + time.subsec)
+        else
+          ::Time.new(year, month, day, time.hour, time.min, time.sec.to_r + time.subsec, time.utc_offset)
+        end
+      end
+    end
+  end
+end

data/lib/elastic_graph/support/untyped_encoder.rb ADDED Viewed

@@ -0,0 +1,67 @@
+# Copyright 2024 Block, Inc.
+#
+# Use of this source code is governed by an MIT-style
+# license that can be found in the LICENSE file or at
+# https://opensource.org/licenses/MIT.
+#
+# frozen_string_literal: true
+require "json"
+module ElasticGraph
+  module Support
+    # Responsible for encoding `Untyped` values into strings. This logic lives here in `elasticgraph-support`
+    # so that it can be shared between the `Untyped` indexing preparer (which lives in `elasticgraph-indexer`)
+    # and the `Untyped` coercion adapter (which lives in `elasticgraph-graphql`). It is important that these
+    # share the same logic so that the string values we attempt to filter on at query time match the string values
+    # we indexed when given the semantically equivalent untyped data.
+    #
+    # Note: change this class with care. Changing the behavior to make `encode` produce different strings may result
+    # in breaking queries if the `Untyped`s stored in the index were indexed using previous encoding logic.
+    # A backfill into the datastore will likely be required to avoid this issue.
+    module UntypedEncoder
+      # Encodes the given untyped value to a String so it can be indexed in an Elasticsearch/OpenSearch `keyword` field.
+      def self.encode(value)
+        return nil if value.nil?
+        # Note: we use `fast_generate` here instead of `generate`. They basically act the same, except
+        # `generate` includes an extra check for self-referential data structures. `value` here ultimately
+        # comes out of a parsed JSON document (e.g. either from an ElasticGraph event at indexing time, or
+        # as a GraphQL query variable at search time), and JSON cannot express self-referential data
+        # structures, so we do not have to worry about that happening.
+        #
+        # ...but even if it did, we would get an error either way: `JSON.generate` would raise
+        # `JSON::NestingError` whereas `JSON.fast_generate` would give us a `SystemStackError`.
+        ::JSON.fast_generate(canonicalize(value))
+      end
+      # Decodes a previously encoded Untyped value, returning its original value.
+      def self.decode(string)
+        return nil if string.nil?
+        ::JSON.parse(string)
+      end
+      # Helper method that converts `value` to a canonical form before we dump it as JSON.
+      # We do this because we index each JSON value as a `keyword` in the index, and we want
+      # equality filters on a JSON value field to consider equivalent JSON objects to be equal
+      # even if their normally generated JSON is not the same. For example, we want ElasticGraph
+      # to treat these two as being equivalent:
+      #
+      # {"a": 1, "b": 2} vs {"b": 2, "a": 1}
+      #
+      # To achieve this, we ensure JSON objects are generated in sorted order, and we use this same
+      # logic both at indexing time and also at query time when we are filtering.
+      private_class_method def self.canonicalize(value)
+        case value
+        when ::Hash
+          value
+            .sort_by { |k, v| k.to_s }
+            .to_h { |k, v| [k, canonicalize(v)] }
+        when ::Array
+          value.map { |v| canonicalize(v) }
+        else
+          value
+        end
+      end
+    end
+  end
+end

data/lib/elastic_graph/version.rb ADDED Viewed

@@ -0,0 +1,15 @@
+# Copyright 2024 Block, Inc.
+#
+# Use of this source code is governed by an MIT-style
+# license that can be found in the LICENSE file or at
+# https://opensource.org/licenses/MIT.
+#
+# frozen_string_literal: true
+module ElasticGraph
+  # The single version number shared by all ElasticGraph gems.
+  VERSION = "0.18.0.0"
+  # Steep (the type checker) weirdly expects this annotation here...
+  # @dynamic self.define_schema
+end