nose 0.1.0pre
- checksums.yaml +7 -0
- data/lib/nose/backend/cassandra.rb +390 -0
- data/lib/nose/backend/file.rb +185 -0
- data/lib/nose/backend/mongo.rb +242 -0
- data/lib/nose/backend.rb +557 -0
- data/lib/nose/cost/cassandra.rb +33 -0
- data/lib/nose/cost/entity_count.rb +27 -0
- data/lib/nose/cost/field_size.rb +31 -0
- data/lib/nose/cost/request_count.rb +32 -0
- data/lib/nose/cost.rb +68 -0
- data/lib/nose/debug.rb +45 -0
- data/lib/nose/enumerator.rb +199 -0
- data/lib/nose/indexes.rb +239 -0
- data/lib/nose/loader/csv.rb +99 -0
- data/lib/nose/loader/mysql.rb +199 -0
- data/lib/nose/loader/random.rb +48 -0
- data/lib/nose/loader/sql.rb +105 -0
- data/lib/nose/loader.rb +38 -0
- data/lib/nose/model/entity.rb +136 -0
- data/lib/nose/model/fields.rb +293 -0
- data/lib/nose/model.rb +113 -0
- data/lib/nose/parser.rb +202 -0
- data/lib/nose/plans/execution_plan.rb +282 -0
- data/lib/nose/plans/filter.rb +99 -0
- data/lib/nose/plans/index_lookup.rb +302 -0
- data/lib/nose/plans/limit.rb +42 -0
- data/lib/nose/plans/query_planner.rb +361 -0
- data/lib/nose/plans/sort.rb +49 -0
- data/lib/nose/plans/update.rb +60 -0
- data/lib/nose/plans/update_planner.rb +270 -0
- data/lib/nose/plans.rb +135 -0
- data/lib/nose/proxy/mysql.rb +275 -0
- data/lib/nose/proxy.rb +102 -0
- data/lib/nose/query_graph.rb +481 -0
- data/lib/nose/random/barbasi_albert.rb +48 -0
- data/lib/nose/random/watts_strogatz.rb +50 -0
- data/lib/nose/random.rb +391 -0
- data/lib/nose/schema.rb +89 -0
- data/lib/nose/search/constraints.rb +143 -0
- data/lib/nose/search/problem.rb +328 -0
- data/lib/nose/search/results.rb +200 -0
- data/lib/nose/search.rb +266 -0
- data/lib/nose/serialize.rb +747 -0
- data/lib/nose/statements/connection.rb +160 -0
- data/lib/nose/statements/delete.rb +83 -0
- data/lib/nose/statements/insert.rb +146 -0
- data/lib/nose/statements/query.rb +161 -0
- data/lib/nose/statements/update.rb +101 -0
- data/lib/nose/statements.rb +645 -0
- data/lib/nose/timing.rb +79 -0
- data/lib/nose/util.rb +305 -0
- data/lib/nose/workload.rb +244 -0
- data/lib/nose.rb +37 -0
- data/templates/workload.erb +42 -0
- metadata +700 -0
data/lib/nose/cost.rb
ADDED
@@ -0,0 +1,68 @@
# frozen_string_literal: true

module NoSE
  # Cost models for steps of backend statement execution
  module Cost
    # Cost model for a backend database
    class Cost
      include Supertype

      def initialize(**options)
        @options = options
      end

      # The cost of filtering intermediate results
      # @return [Fixnum]
      def filter_cost(_step)
        # Assume this has no cost and the cost is captured in the fact that we
        # have to retrieve more data earlier. All this does is skip records.
        0
      end

      # The cost of limiting a result set
      # @return [Fixnum]
      def limit_cost(_step)
        # This is basically free since we just discard data
        0
      end

      # The cost of sorting a set of results
      # @return [Fixnum]
      def sort_cost(_step)
        # TODO: Find some estimate of sort cost
        # This could be partially captured by the fact that sort + limit
        # effectively removes the limit
        1
      end

      # The cost of performing a lookup via an index
      # @return [Fixnum]
      def index_lookup_cost(_step)
        fail NotImplementedError, 'Must be implemented in a subclass'
      end

      # The cost of performing a deletion from an index
      # @return [Fixnum]
      def delete_cost(_step)
        fail NotImplementedError, 'Must be implemented in a subclass'
      end

      # The cost of performing an insert into an index
      # @return [Fixnum]
      def insert_cost(_step)
        fail NotImplementedError, 'Must be implemented in a subclass'
      end

      # This is here for debugging purposes because we need a cost
      # @return [Fixnum]
      def pruned_cost(_step)
        0
      end
    end
  end
end

require_relative 'cost/cassandra'
require_relative 'cost/entity_count'
require_relative 'cost/field_size'
require_relative 'cost/request_count'
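As a sketch of the extension point this class defines, a backend-specific model only needs to override the methods that raise NotImplementedError. The subclass name and the step's cardinality attribute below are hypothetical, for illustration only:

# Hypothetical cost model charging one unit per row a lookup is expected
# to return; assumes the step object exposes an estimated cardinality.
class UnitRowCost < NoSE::Cost::Cost
  def index_lookup_cost(step)
    step.cardinality
  end

  def delete_cost(_step)
    1
  end

  def insert_cost(_step)
    1
  end
end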
data/lib/nose/debug.rb
ADDED
@@ -0,0 +1,45 @@
# frozen_string_literal: true

# rubocop:disable Lint/HandleExceptions
begin
  require 'binding_of_caller'
  require 'pry'
rescue LoadError
  # Ignore in case we are not in development mode
end
# rubocop:enable Lint/HandleExceptions

module NoSE
  # Various helpful debugging snippets
  module Debug
    # Convenience method to break in IndexLookupStep when
    # a particular set of indexes is reached when planning
    # @return [void]
    def self.break_on_indexes(*index_keys)
      apply = binding.of_caller(1)
      parent = apply.eval 'parent'
      index = apply.eval 'index'
      current_keys = parent.parent_steps.indexes.map(&:key) << index.key

      # rubocop:disable Lint/Debugger
      binding.pry if current_keys == index_keys
      # rubocop:enable Lint/Debugger
    end

    # Export entities in a model as global
    # variables for easier access when debugging
    # @return [void]
    def self.export_model(model)
      model.entities.each do |name, entity|
        # rubocop:disable Lint/Eval
        eval("$#{name} = entity")
        # rubocop:enable Lint/Eval

        entity.fields.merge(entity.foreign_keys).each do |field_name, field|
          entity.define_singleton_method field_name.to_sym, -> { field }
        end
      end

      nil
    end
  end
end
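A minimal sketch of how these helpers might be invoked; the index keys and the model object below are hypothetical:

# Called from planner code where `parent` and `index` are in scope (see the
# binding.of_caller lookup above); drops into pry for this exact key chain.
NoSE::Debug.break_on_indexes 'i1234567890', 'i0987654321'

# In a console session, expose each entity of a model as a global variable
# (e.g. $users), assuming `model` is a previously loaded NoSE model.
NoSE::Debug.export_model model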
data/lib/nose/enumerator.rb
ADDED
@@ -0,0 +1,199 @@
# frozen_string_literal: true

require 'logging'

module NoSE
  # Produces potential indices to be used in schemas
  class IndexEnumerator
    def initialize(workload)
      @logger = Logging.logger['nose::enumerator']

      @workload = workload
    end

    # Produce all possible indices for a given query
    # @return [Array<Index>]
    def indexes_for_query(query)
      @logger.debug "Enumerating indexes for query #{query.text}"

      range = if query.range_field.nil?
                query.order
              else
                [query.range_field] + query.order
              end

      eq = query.eq_fields.group_by(&:parent)
      eq.default_proc = ->(*) { [] }

      range = range.group_by(&:parent)
      range.default_proc = ->(*) { [] }

      query.graph.subgraphs.flat_map do |graph|
        indexes_for_graph graph, query.select, eq, range
      end.uniq << query.materialize_view
    end

    # Produce all possible indices for a given workload
    # @return [Set<Index>]
    def indexes_for_workload(additional_indexes = [], by_id_graph = false)
      queries = @workload.queries
      indexes = Parallel.map(queries) do |query|
        indexes_for_query(query).to_a
      end.inject(additional_indexes, &:+)

      # Add indexes generated for support queries
      supporting = support_indexes indexes, by_id_graph
      supporting += support_indexes supporting, by_id_graph
      indexes += supporting

      # Deduplicate indexes, combine them and deduplicate again
      indexes.uniq!
      combine_indexes indexes
      indexes.uniq!

      @logger.debug do
        "Indexes for workload:\n" + indexes.map.with_index do |index, i|
          "#{i} #{index.inspect}"
        end.join("\n")
      end

      indexes
    end

    private

    # Produce the indexes necessary for support queries for these indexes
    # @return [Array<Index>]
    def support_indexes(indexes, by_id_graph)
      # If indexes are grouped by ID graph, convert them before updating
      # since other updates will be managed automatically by index maintenance
      indexes = indexes.map(&:to_id_graph).uniq if by_id_graph

      # Collect all possible support queries
      queries = indexes.flat_map do |index|
        @workload.updates.flat_map do |update|
          update.support_queries(index)
        end
      end

      # Enumerate indexes for each support query
      queries.uniq!
      queries.flat_map do |query|
        indexes_for_query(query).to_a
      end
    end

    # Combine the data of indices based on matching hash fields
    def combine_indexes(indexes)
      no_order_indexes = indexes.select do |index|
        index.order_fields.empty?
      end
      no_order_indexes = no_order_indexes.group_by do |index|
        [index.hash_fields, index.graph]
      end

      no_order_indexes.each do |(hash_fields, graph), hash_indexes|
        extra_choices = hash_indexes.map(&:extra).uniq

        # XXX More combos?
        combos = extra_choices.combination(2)

        combos.map do |combo|
          indexes << Index.new(hash_fields, [], combo.inject(Set.new, &:+),
                               graph)
          @logger.debug "Enumerated combined index #{indexes.last.inspect}"
        end
      end
    end

    # Get all possible choices of fields to use for equality
    # @return [Array<Array>]
    def eq_choices(graph, eq)
      entity_choices = graph.entities.flat_map do |entity|
        # Get the fields for the entity and add in the IDs
        entity_fields = eq[entity] << entity.id_field
        entity_fields.uniq!
        1.upto(entity_fields.count).flat_map do |n|
          entity_fields.permutation(n).to_a
        end
      end

      2.upto(graph.entities.length).flat_map do |n|
        entity_choices.permutation(n).map(&:flatten).to_a
      end + entity_choices
    end

    # Get fields which should be included in an index for the given graph
    # @return [Array<Array>]
    def extra_choices(graph, select, eq, range)
      choices = eq.values + range.values << select.to_a

      choices.each do |choice|
        choice.select { |field| graph.entities.include?(field.parent) }
      end

      choices.reject(&:empty?) << []
    end

    # Get all possible indices which jump a given piece of a query graph
    # @return [Array<Index>]
    def indexes_for_graph(graph, select, eq, range)
      eq_choices = eq_choices graph, eq
      range_fields = graph.entities.map { |entity| range[entity] }.reduce(&:+)
      range_fields.uniq!
      order_choices = range_fields.prefixes.flat_map do |fields|
        fields.permutation.to_a
      end.uniq << []
      extra_choices = extra_choices graph, select, eq, range
      extra_choices = 1.upto(extra_choices.length).flat_map do |n|
        extra_choices.combination(n).map(&:flatten).map(&:uniq)
      end.uniq

      # Generate all possible indices based on the field choices
      choices = eq_choices.product(extra_choices)
      indexes = choices.map! do |index, extra|
        indexes = []

        order_choices.each do |order|
          # Append the primary key of the entities in the graph if needed
          order += graph.entities.sort_by(&:name).map(&:id_field) -
                   (index + order)

          # Partition into the ordering portion
          index.partitions.each do |index_prefix, order_prefix|
            hash_fields = index_prefix.take_while do |field|
              field.parent == index.first.parent
            end
            order_fields = index_prefix[hash_fields.length..-1] + \
                           order_prefix + order
            extra_fields = extra - hash_fields - order_fields
            next if order_fields.empty? && extra_fields.empty?

            new_index = generate_index hash_fields, order_fields, extra_fields,
                                       graph
            indexes << new_index unless new_index.nil?
          end
        end

        indexes
      end.inject([], &:+)
      indexes.flatten!

      indexes
    end

    # Generate a new index and ignore if invalid
    # @return [Index]
    def generate_index(hash, order, extra, graph)
      begin
        index = Index.new hash, order.uniq, extra, graph
        @logger.debug { "Enumerated #{index.inspect}" }
      rescue InvalidIndexException
        # This combination of fields is not valid, that's ok
        index = nil
      end

      index
    end
  end
end
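For orientation, a rough sketch of driving the enumerator; it assumes `workload` is a previously constructed NoSE::Workload with queries and updates already defined:

# Enumerate every candidate index for the workload, including support
# indexes for updates, and print them for inspection.
enumerator = NoSE::IndexEnumerator.new workload
candidates = enumerator.indexes_for_workload
candidates.each { |index| puts index.inspect }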
data/lib/nose/indexes.rb
ADDED
@@ -0,0 +1,239 @@
# frozen_string_literal: true

module NoSE
  # A representation of materialized views over fields in an entity
  class Index
    attr_reader :hash_fields, :order_fields, :extra, :all_fields, :path,
                :entries, :entry_size, :size, :hash_count, :per_hash_count,
                :graph

    def initialize(hash_fields, order_fields, extra, graph, saved_key = nil)
      order_set = order_fields.to_set
      @hash_fields = hash_fields.to_set
      @order_fields = order_fields.delete_if { |e| hash_fields.include? e }
      @extra = extra.to_set.delete_if do |e|
        @hash_fields.include?(e) || order_set.include?(e)
      end
      @all_fields = Set.new(@hash_fields).merge(order_set).merge(@extra)

      validate_hash_fields

      # Store whether this index is an identity
      @identity = @hash_fields == [
        @hash_fields.first.parent.id_field
      ].to_set && graph.nodes.size == 1

      @graph = graph
      @path = graph.longest_path
      @path = nil unless @path.length == graph.size

      validate_graph

      build_hash saved_key
    end

    # Check if this index maps from the primary key to fields from one entity
    # @return [Boolean]
    def identity?
      @identity
    end

    # A simple key which uniquely identifies the index
    # @return [String]
    def key
      @key ||= "i#{Zlib.crc32 hash_str}"
    end

    # Look up a field in the index based on its ID
    # @return [Fields::Field]
    def [](field_id)
      @all_fields.find { |field| field.id == field_id }
    end

    # Check if this index is an ID graph
    # @return [Boolean]
    def id_graph?
      @hash_fields.all?(&:primary_key?) && @order_fields.all?(&:primary_key?)
    end

    # Produce an index with the same fields but keyed by entities in the graph
    def to_id_graph
      return self if id_graph?

      all_ids = (@hash_fields.to_a + @order_fields + @extra.to_a)
      all_ids.map! { |f| f.parent.id_field }.uniq!

      hash_fields = [all_ids.first]
      order_fields = all_ids[1..-1]
      extra = @all_fields - hash_fields - order_fields

      Index.new hash_fields, order_fields, extra, @graph
    end

    # :nocov:
    def to_color
      fields = [@hash_fields, @order_fields, @extra].map do |field_group|
        '[' + field_group.map(&:inspect).join(', ') + ']'
      end

      "[magenta]#{key}[/] #{fields[0]} #{fields[1]} → #{fields[2]}" \
        " [yellow]$#{size}[/]" \
        " [magenta]#{@graph.inspect}[/]"
    end
    # :nocov:

    # Two indices are equal if they contain the same fields
    # @return [Boolean]
    def ==(other)
      hash == other.hash
    end
    alias eql? ==

    # Hash based on the fields, their keys, and the graph
    # @return [String]
    def hash_str
      @hash_str ||= [
        @hash_fields.map(&:id).sort!,
        @order_fields.map(&:id),
        @extra.map(&:id).sort!,
        @graph.unique_edges.map(&:canonical_params).sort!
      ].to_s.freeze
    end

    def hash
      @hash ||= Zlib.crc32 hash_str
    end

    # Check if the index contains a given field
    # @return [Boolean]
    def contains_field?(field)
      @all_fields.include? field
    end

    private

    # Initialize the hash function and freeze ourselves
    # @return [void]
    def build_hash(saved_key)
      @key = saved_key

      hash
      key
      calculate_size
      freeze
    end

    # Check for valid hash fields in an index
    # @return [void]
    def validate_hash_fields
      fail InvalidIndexException, 'hash fields cannot be empty' \
        if @hash_fields.empty?

      fail InvalidIndexException, 'hash fields can only involve one entity' \
        if @hash_fields.map(&:parent).to_set.size > 1
    end

    # Ensure an index is nonempty
    # @return [void]
    def validate_nonempty
      fail InvalidIndexException, 'must have fields other than hash fields' \
        if @order_fields.empty? && @extra.empty?
    end

    # Ensure an index and its fields correspond to a valid graph
    # @return [void]
    def validate_graph
      validate_graph_entities
      validate_graph_keys
    end

    # Ensure the graph of the index is valid
    # @return [void]
    def validate_graph_entities
      entities = @all_fields.map(&:parent).to_set
      fail InvalidIndexException, 'graph entities do not match index' \
        unless entities == @graph.entities.to_set
    end

    # We must have the primary keys of all the entities in the graph
    # @return [void]
    def validate_graph_keys
      fail InvalidIndexException, 'missing graph entity keys' \
        unless @graph.entities.map(&:id_field).all? do |field|
          @hash_fields.include?(field) || @order_fields.include?(field)
        end
    end

    # Precalculate the size of the index
    # @return [void]
    def calculate_size
      @hash_count = @hash_fields.product_by(&:cardinality)

      # XXX This only works if foreign keys span all possible keys
      # Take the maximum possible count at each join and multiply
      @entries = @graph.entities.map(&:count).max
      @per_hash_count = (@entries * 1.0 / @hash_count)

      @entry_size = @all_fields.sum_by(&:size)
      @size = @entries * @entry_size
    end
  end

  # Thrown when something tries to create an invalid index
  class InvalidIndexException < StandardError
  end

  # Allow entities to create their own indices
  class Entity
    # Create a simple index which maps entity keys to other fields
    # @return [Index]
    def simple_index
      Index.new [id_field], [], fields.values - [id_field],
                QueryGraph::Graph.from_path([id_field]), name
    end
  end

  # Allow statements to materialize views
  class Statement
    # Construct an index which acts as a materialized view for a query
    # @return [Index]
    def materialize_view
      eq = materialized_view_eq join_order.first
      order_fields = materialized_view_order(join_order.first) - eq

      Index.new(eq, order_fields,
                all_fields - (@eq_fields + @order).to_set, @graph)
    end

    private

    # Get the fields used as partition keys for a materialized view
    # based over a given entity
    # @return [Array<Fields::Field>]
    def materialized_view_eq(hash_entity)
      eq = @eq_fields.select { |field| field.parent == hash_entity }
      eq = [join_order.last.id_field] if eq.empty?

      eq
    end

    # Get the ordered keys for a materialized view
    # @return [Array<Fields::Field>]
    def materialized_view_order(hash_entity)
      # Start the ordered fields with the equality predicates
      # on other entities, followed by all of the attributes
      # used in ordering, then the range field
      order_fields = @eq_fields.select do |field|
        field.parent != hash_entity
      end + @order
      if @range_field && !@order.include?(@range_field)
        order_fields << @range_field
      end

      # Ensure we include IDs of the final entity
      order_fields += join_order.map(&:id_field)

      order_fields.uniq
    end
  end
end
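A short sketch of the Entity#simple_index helper defined above; `user` stands in for an entity from a model defined elsewhere:

# Maps the entity's ID field to all of its other fields over a
# single-node query graph.
index = user.simple_index
index.identity?  # => true: keyed on the ID field of a single entity
index.key        # the entity name, since simple_index passes it as saved_key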
data/lib/nose/loader/csv.rb
ADDED
@@ -0,0 +1,99 @@
# frozen_string_literal: true

require 'formatador'
require 'smarter_csv'
require 'zlib'

module NoSE
  module Loader
    # Load data into an index from a set of CSV files
    class CsvLoader < LoaderBase
      def initialize(workload = nil, backend = nil)
        super

        @logger = Logging.logger['nose::loader::csvloader']
      end

      # Load data for all the indexes
      def load(indexes, config, show_progress = false, limit = nil,
               skip_existing = true)
        indexes.map!(&:to_id_graph).uniq! if @backend.by_id_graph

        simple_indexes = find_simple_indexes indexes, skip_existing
        simple_indexes.each do |entity, simple_index_list|
          filename = File.join config[:directory], "#{entity.name}.csv"
          total_rows = (limit || 0) - 1 # account for header row
          File.open(filename) { |file| file.each_line { total_rows += 1 } }

          progress = initialize_progress entity, simple_index_list,
                                         total_rows if show_progress
          load_file_indexes filename, entity, simple_index_list, progress
        end
      end

      private

      # Find the simple indexes we should populate
      # @return [Hash<Entity, Index>]
      def find_simple_indexes(indexes, skip_existing)
        simple_indexes = indexes.select do |index|
          index.graph.size == 1 &&
            !(skip_existing && !@backend.index_empty?(index))
        end

        simple_indexes.group_by do |index|
          index.hash_fields.first.parent
        end
      end

      # Initialize a progress bar to report loading results
      # @return [Formatador::ProgressBar]
      def initialize_progress(entity, simple_index_list, total_rows)
        @logger.info "Loading simple indexes for #{entity.name}"
        @logger.info simple_index_list.map(&:key).join(', ')

        Formatador.new.redisplay_progressbar 0, total_rows
        Formatador::ProgressBar.new total_rows, started_at: Time.now.utc
      end

      # Load all indexes for a given file
      # @return [void]
      def load_file_indexes(filename, entity, simple_index_list, progress)
        SmarterCSV.process(filename,
                           downcase_header: false,
                           chunk_size: 1000,
                           convert_values_to_numeric: false) do |chunk|
          Parallel.each(chunk.each_slice(100),
                        finish: (lambda do |_, _, _|
                          next if progress.nil?
                          inc = [progress.total - progress.current, 100].min
                          progress.increment inc
                        end)) do |minichunk|
            load_simple_chunk minichunk, entity, simple_index_list
          end
        end
      end

      # Load a chunk of data from a simple entity index
      # @return [void]
      def load_simple_chunk(chunk, entity, indexes)
        # Prefix all hash keys with the entity name and convert values
        chunk.map! do |row|
          index_row = {}
          row.each_key do |key|
            field_class = entity[key.to_s].class
            value = field_class.value_from_string row[key]
            index_row["#{entity.name}_#{key}"] = value
          end

          index_row
        end

        # Insert the batch into the index
        indexes.each do |index|
          @backend.index_insert_chunk index, chunk
        end
      end
    end
  end
end
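Finally, a hedged sketch of invoking the loader; the directory path and the `workload`, `backend`, and `indexes` objects are assumed to be set up elsewhere:

# Expects one CSV file per entity, named <entity>.csv, in the directory;
# the third argument enables the progress bar during loading.
loader = NoSE::Loader::CsvLoader.new workload, backend
loader.load indexes, { directory: 'data/csv' }, true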