active_data_frame 0.1.5 → 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6e6c248d13e0f7f10933eca32158e4fb33a080e3
4
- data.tar.gz: 28d52390deef35b8e582942612989f99e2026ed3
3
+ metadata.gz: 32b9b56e2515e43f2a30a10a35fd3c86d8de0bdf
4
+ data.tar.gz: 875f4e3f4cd9f19d4b0141b4c34a9420a36a06cd
5
5
  SHA512:
6
- metadata.gz: 9fd95c152778f43ea9d3d3e09160a22ed355989b5fdc5e2cbbfb1f10b2290aab4db07d04344acb595f30f28a98eaf1282b4a461611ba6c5999b5a060fc60ae77
7
- data.tar.gz: c0b32d8827258e8e8cf38e051758d8de8a1784bb2a4b19cdaada75af5b9b1541b3c45d15e6d2f033ca07ad1ec9422fc290c11b44e41fd2df5ef08b68bf805c47
6
+ metadata.gz: a4558730591012b79e19b19588a925e54b6678fdc279dc457bbad11c08ffdd3b2d4fd55c5d5668e90ed19c1ada58cd9ec73614fbfdbb72765f904ddd5c0509d1
7
+ data.tar.gz: f9b7852d03b01c23144c12780b87953e8d89fa039635e3bd2a7eb3029ad9ce0df6ee882c8bc1a2b902b7500a74d861aa8ace3720eb9d4b62b79abc5f40e8f9cc
@@ -8,7 +8,10 @@ require 'rmatrix'
8
8
 
9
9
  module ActiveDataFrame
10
10
  CONFIG = OpenStruct.new({
11
- suppress_logs: true
11
+ suppress_logs: false,
12
+ insert_max_batch_size: 10_000,
13
+ update_max_batch_size: 10_000,
14
+ delete_max_batch_size: 10_000,
12
15
  })
13
16
 
14
17
  module_function
@@ -16,7 +19,7 @@ module ActiveDataFrame
16
19
  yield CONFIG
17
20
  end
18
21
 
19
- def suppress_logs
20
- CONFIG.suppress_logs
22
+ CONFIG.each_pair do |(key)|
23
+ define_method(key){ CONFIG.send(key) }
21
24
  end
22
25
  end
@@ -31,12 +31,27 @@ module ActiveDataFrame
31
31
  def []=(from, values)
32
32
  values = Array(values).flatten.map(&@value_map.method(:[])) if @value_map
33
33
  from = column_map[from] if column_map && column_map[from]
34
- set(from, M[values, typecode: block_type::TYPECODE].to_a.flatten)
34
+ if values.kind_of?(Hash)
35
+ values = verify_and_cleanse_hash_values(values)
36
+ else
37
+ values = M[values, typecode: block_type::TYPECODE].to_a.flatten
38
+ end
39
+ set(from, values)
40
+ end
41
+
42
+ def verify_and_cleanse_hash_values(map)
43
+ length = nil
44
+ map.transform_values do |values|
45
+ cleansed = M[values, typecode: block_type::TYPECODE].to_a.flatten
46
+ raise "All streams provided via a hash must be of the same length" if length && length != cleansed.length
47
+ length ||= cleansed.length
48
+ cleansed
49
+ end
35
50
  end
36
51
 
37
52
  def clear(*ranges)
38
53
  extract_ranges(ranges).each do |r|
39
- set(r.first, M.blank(columns: r.last - r.first, typecode: block_type::TYPECODE))
54
+ set(r.first, M.blank(columns: r.last - r.first, typecode: block_type::TYPECODE), trim: true)
40
55
  end
41
56
  end
42
57
 
@@ -96,6 +111,10 @@ module ActiveDataFrame
96
111
  end
97
112
 
98
113
  def get_bounds(from, to, index=0)
114
+ self.class.get_bounds(from, to, block_type, index)
115
+ end
116
+
117
+ def self.get_bounds(from, to, block_type, index=0)
99
118
  from_block_index = from / block_type::BLOCK_SIZE
100
119
  from_block_offset = from % block_type::BLOCK_SIZE
101
120
  to_block_index = to / block_type::BLOCK_SIZE
@@ -117,6 +136,12 @@ module ActiveDataFrame
117
136
  end
118
137
 
119
138
  def iterate_bounds(all_bounds)
139
+ self.class.iterate_bounds(all_bounds, block_type) do |index, left, right, cursor, size|
140
+ yield index, left, right, cursor, size
141
+ end
142
+ end
143
+
144
+ def self.iterate_bounds(all_bounds, block_type)
120
145
  cursor = 0
121
146
  all_bounds.each do |bounds|
122
147
  index = bounds.from.index
@@ -58,93 +58,115 @@ module ActiveDataFrame
58
58
  self.batching = prev_batch
59
59
  flush! unless self.batching
60
60
  end
61
+
62
+ def bulk_upsert(updates, inserts)
63
+ Database.batch do
64
+ updates.group_by(&:keys).transform_values{|v| v.map(&:values) }.each do |columns, rows|
65
+ update = rows.map{|df_id, period_index, *values| [period_index, [values, df_id]] }
66
+ bulk_update(update, columns - [:data_frame_id, :period_index])
67
+ end
68
+ inserts.group_by(&:keys).transform_values{|v| v.map(&:values) }.each do |columns, rows|
69
+ insert = rows.map{|df_id, period_index, *values| [period_index, [values, df_id]] }
70
+ bulk_insert(insert, columns - [:data_frame_id, :period_index])
71
+ end
72
+ end
73
+ end
61
74
  ##
62
75
  # Update block data for all blocks in a single call
63
76
  ##
64
- def bulk_update(existing)
65
- case ActiveRecord::Base.connection_config[:adapter]
66
- when 'postgresql'.freeze
77
+ def bulk_update(existing, columns=block_type::COLUMNS)
78
+ existing.each_slice(ActiveDataFrame.update_max_batch_size) do |existing_slice|
79
+ # puts "Updating slice of #{existing_slice.length}"
80
+ case ActiveRecord::Base.connection_config[:adapter]
81
+ when 'postgresql'.freeze
82
+ #
83
+ # PostgreSQL supports the fast setting of multiple update values that differ
84
+ # per row from a temporary table.
85
+ #
86
+ updates = ''
87
+ existing_slice.each do |period_index, (values, df_id)|
88
+ updates << "(#{df_id}, #{period_index}, #{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}),"
89
+ end
90
+ Database.execute(
91
+ <<-SQL
92
+ UPDATE #{block_type.table_name}
93
+ SET #{columns.map{|col| "#{col} = t.#{col}" }.join(", ")}
94
+ FROM(
95
+ VALUES #{updates[0..-2]}) as t(data_frame_id, period_index, #{columns.join(',')})
96
+ WHERE #{block_type.table_name}.data_frame_id = t.data_frame_id
97
+ AND #{block_type.table_name}.period_index = t.period_index
98
+ AND #{block_type.table_name}.data_frame_type = '#{data_frame_type.name}'
99
+ SQL
100
+ )
67
101
  #
68
- # PostgreSQL Supports the fast setting of multiple update values that differ
69
- # per row from a temporary table.
102
+ # For MySQL we use the ON DUPLICATE KEY UPDATE functionality.
103
+ # This relies on there being a unique index on data frame and period index
104
+ # on the blocks table.
105
+ # This tends to be faster than the general CASE based solution below
106
+ # but slower than the PostgreSQL solution above
70
107
  #
71
- updates = ''
72
- existing.each do |period_index, (values, df_id)|
73
- updates << "(#{df_id}, #{period_index}, #{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}),"
74
- end
75
- Database.execute(
76
- <<-SQL
77
- UPDATE #{block_type.table_name}
78
- SET #{block_type::COLUMNS.map{|col| "#{col} = t.#{col}" }.join(", ")}
79
- FROM(
80
- VALUES #{updates[0..-2]}) as t(data_frame_id, period_index, #{block_type::COLUMNS.join(',')})
81
- WHERE #{block_type.table_name}.data_frame_id = t.data_frame_id
82
- AND #{block_type.table_name}.period_index = t.period_index
108
+ when 'mysql2'.freeze
109
+ # Fast bulk update
110
+ updates, on_duplicate = "", ""
111
+ existing_slice.each do |period_index, (values, df_id)|
112
+ updates << "(#{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}, #{df_id}, #{period_index}, '#{data_frame_type.name}'),"
113
+ end
114
+ on_duplicate = columns.map do |cname|
115
+ "#{cname}=VALUES(#{cname})"
116
+ end.join(", ")
117
+ stmt = <<-SQL
118
+ INSERT INTO #{block_type.table_name} (#{columns.join(',')},data_frame_id,period_index,data_frame_type)
119
+ VALUES #{updates[0..-2]}
120
+ ON DUPLICATE KEY UPDATE #{on_duplicate}
121
+ SQL
122
+ Database.execute(stmt)
123
+ else
124
+ #
125
+ # General CASE based solution for multiple differing updates
126
+ # set per row.
127
+ # We use a CASE statement per column which determines the column
128
+ # to set based on the period index
129
+ #
130
+ ids = existing_slice.map {|_, (_, id)| id}
131
+ updates = columns.map.with_index do |column, column_idx|
132
+ [column, "CASE period_index\n#{existing_slice.map{|period_index, (values, _)| "WHEN #{period_index} then #{values[column_idx]}"}.join("\n")} \nEND\n"]
133
+ end.to_h
134
+ update_statement = updates.map{|cl, up| "#{cl} = #{up}" }.join(', ')
135
+ Database.execute(<<-SQL
136
+ UPDATE #{block_type.table_name} SET #{update_statement} WHERE
137
+ #{block_type.table_name}.data_frame_id IN (#{ids.join(',')})
83
138
  AND #{block_type.table_name}.data_frame_type = '#{data_frame_type.name}'
139
+ AND #{block_type.table_name}.period_index IN (#{existing_slice.map(&:first).join(', ')});
84
140
  SQL
85
- )
86
- #
87
- # For MySQL we use the ON DUPLICATE KEY UPDATE functionality.
88
- # This relies on there being a unique index dataframe and period index
89
- # on the blocks table.
90
- # This tends to be faster than the general CASE based solution below
91
- # but slower than the PostgreSQL solution above
92
- #
93
- when 'mysql2'.freeze
94
- # Fast bulk update
95
- updates, on_duplicate = "", ""
96
- existing.each do |period_index, (values, df_id)|
97
- updates << "(#{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}, #{df_id}, #{period_index}, '#{data_frame_type.name}'),"
141
+ )
98
142
  end
99
- on_duplicate = block_type::COLUMNS.map do |cname|
100
- "#{cname}=VALUES(#{cname})"
101
- end.join(", ")
102
- stmt = <<-SQL
103
- INSERT INTO #{block_type.table_name} (#{block_type::COLUMNS.join(',')},data_frame_id,period_index,data_frame_type)
104
- VALUES #{updates[0..-2]}
105
- ON DUPLICATE KEY UPDATE #{on_duplicate}
106
- SQL
107
- Database.execute(stmt)
108
- else
109
- #
110
- # General CASE based solution for multiple differing updates
111
- # set per row.
112
- # We use a CASE statement per column which determines the column
113
- # to set based on the period index
114
- #
115
- ids = existing.map {|_, (_, id)| id}
116
- updates = block_type::COLUMNS.map.with_index do |column, column_idx|
117
- [column, "CASE period_index\n#{existing.map{|period_index, (values, _)| "WHEN #{period_index} then #{values[column_idx]}"}.join("\n")} \nEND\n"]
118
- end.to_h
119
- update_statement = updates.map{|cl, up| "#{cl} = #{up}" }.join(', ')
120
- Database.execute(<<-SQL
121
- UPDATE #{block_type.table_name} SET #{update_statement} WHERE
122
- #{block_type.table_name}.data_frame_id IN (#{ids.join(',')})
123
- AND #{block_type.table_name}.data_frame_type = '#{data_frame_type.name}'
124
- AND #{block_type.table_name}.period_index IN (#{existing.keys.join(', ')});
125
- SQL
126
- )
127
143
  end
128
144
  end
129
145
 
130
146
  def bulk_delete(id, indices)
131
- block_type.where(data_frame_id: id, period_index: indices).delete_all
147
+ indices.each_slice(ActiveDataFrame.delete_max_batch_size) do |slice|
148
+ # puts "Deleting slice of #{slice.length}"
149
+ block_type.where(data_frame_id: id, period_index: slice).delete_all
150
+ end
132
151
  end
133
152
 
134
153
  ##
135
154
  # Insert block data for all blocks in a single call
136
155
  ##
137
- def bulk_insert(new_blocks, instance)
138
- inserts = ''
139
- new_blocks.each do |period_index, (values)|
140
- inserts << \
141
- case ActiveRecord::Base.connection_config[:adapter]
142
- when 'postgresql', 'mysql2' then "(#{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}, #{instance.id}, #{period_index}, '#{data_frame_type.name}'),"
143
- else "(#{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}, #{instance.id}, #{period_index}, '#{data_frame_type.name}'),"
156
+ def bulk_insert(new_blocks, columns=block_type::COLUMNS)
157
+ new_blocks.each_slice(ActiveDataFrame.insert_max_batch_size) do |new_blocks_slice|
158
+ # puts "Inserting slice of #{new_blocks_slice.length}"
159
+ inserts = ''
160
+ new_blocks_slice.each do |period_index, (values, df_id)|
161
+ inserts << \
162
+ case ActiveRecord::Base.connection_config[:adapter]
163
+ when 'postgresql', 'mysql2' then "(#{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}, #{df_id}, #{period_index}, '#{data_frame_type.name}'),"
164
+ else "(#{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}, #{df_id}, #{period_index}, '#{data_frame_type.name}'),"
165
+ end
144
166
  end
167
+ sql = "INSERT INTO #{block_type.table_name} (#{columns.join(',')}, data_frame_id, period_index, data_frame_type) VALUES #{inserts[0..-2]}"
168
+ Database.execute sql
145
169
  end
146
- sql = "INSERT INTO #{block_type.table_name} (#{block_type::COLUMNS.join(',')}, data_frame_id, period_index, data_frame_type) VALUES #{inserts[0..-2]}"
147
- Database.execute sql
148
170
  end
149
171
  end
150
172
  end
@@ -12,15 +12,72 @@ module ActiveDataFrame
12
12
  "#{data_frame_type.name} Row(#{instance.id})"
13
13
  end
14
14
 
15
- def set(from, values)
15
+ def self.set_all(scope, block_type, data_frame_type, from, values, trim: false)
16
+ if trim || ActiveRecord::Base.connection_config[:adapter] === 'mysql2'
17
+ scope.each do |instance|
18
+ Row.new(block_type, data_frame_type, instance).patch(from, values.kind_of?(Hash) ? values[instance.id] : values)
19
+ end
20
+ else
21
+ upsert_all(scope, block_type, data_frame_type, from, values)
22
+ end
23
+ end
24
+
25
+ def self.upsert_all(rows, block_type, data_frame_type, from, values)
26
+ length = values.kind_of?(Hash) ? values.values.first.length : values.length
27
+ to = from + length - 1
28
+ bounds = get_bounds(from, to, block_type)
29
+ scope = block_type.where(data_frame_type: data_frame_type.name, data_frame_id: rows.select(:id))
30
+ scope = scope.where(data_frame_id: values.keys) if values.kind_of?(Hash)
31
+ all_update_indices = scope.where(period_index: bounds.from.index..bounds.to.index).order(data_frame_id: :asc, period_index: :asc).pluck(:data_frame_id, :period_index)
32
+ grouped_update_indices = all_update_indices.group_by(&:first).transform_values{|value| Set.new(value.map!(&:last)) }
33
+ instance_ids = rows.pluck(:id)
34
+ instance_ids &= values.keys if values.kind_of?(Hash)
35
+ upserts = to_enum(:iterate_bounds, [bounds], block_type).flat_map do |index, left, right, cursor, size|
36
+ instance_ids.map do |instance_id|
37
+ slice = values.kind_of?(Hash) ? values[instance_id][cursor...cursor + size] : values[cursor...cursor + size]
38
+ [[:data_frame_id, instance_id], [:period_index, index], *(left.succ..right.succ).map{|v| :"t#{v}" }.zip(slice)].to_h
39
+ end
40
+ end
41
+
42
+ update, insert = upserts.partition{|upsert| grouped_update_indices[upsert[:data_frame_id]]&.include?(upsert[:period_index]) }
43
+ Database.for_types(block: block_type, df: data_frame_type).bulk_upsert(update, insert)
44
+ values
45
+ end
46
+
47
+ def set(from, values, trim: false)
48
+ if trim || ActiveRecord::Base.connection_config[:adapter] === 'mysql2'
49
+ patch(from, values)
50
+ else
51
+ upsert(from, values)
52
+ end
53
+ end
54
+
55
+ def upsert(from, values)
56
+ to = (from + values.length) - 1
57
+ bounds = get_bounds(from, to)
58
+ update_indices = Set.new(scope.where(period_index: bounds.from.index..bounds.to.index).order(period_index: :asc).pluck(:period_index))
59
+ # Detect blocks in bounds:
60
+ # - If existing and covered, do an update without load
61
+ # - If existing and uncovered, do a small write (without load)
62
+ # - If not existing, insert!
63
+ upserts = to_enum(:iterate_bounds, [bounds]).map do |index, left, right, cursor, size|
64
+ [[:data_frame_id, self.instance.id], [:period_index, index], *(left.succ..right.succ).map{|v| :"t#{v}" }.zip(values[cursor...cursor + size])].to_h
65
+ end
66
+ update, insert = upserts.partition{|upsert| update_indices.include?(upsert[:period_index]) }
67
+ database.bulk_upsert(update, insert)
68
+ values
69
+ end
70
+
71
+ def patch(from, values)
16
72
  to = (from + values.length) - 1
17
73
  bounds = get_bounds(from, to)
18
74
 
19
75
  new_blocks = Hash.new do |h, k|
20
- h[k] = [[0] * block_type::BLOCK_SIZE]
76
+ h[k] = [[0] * block_type::BLOCK_SIZE, self.instance.id]
21
77
  end
22
78
 
23
79
  deleted_indices = []
80
+
24
81
  existing = blocks_between([bounds]).pluck(:data_frame_id, :period_index, *block_type::COLUMNS).map do |id, period_index, *block_values|
25
82
  [period_index, [block_values, id]]
26
83
  end.to_h
@@ -41,8 +98,8 @@ module ActiveDataFrame
41
98
 
42
99
 
43
100
  database.bulk_delete(self.instance.id, deleted_indices) unless deleted_indices.size.zero?
44
- database.bulk_update(existing) unless existing.size.zero?
45
- database.bulk_insert(new_blocks, instance) unless new_blocks.size.zero?
101
+ database.bulk_update(existing) unless existing.size.zero?
102
+ database.bulk_insert(new_blocks) unless new_blocks.size.zero?
46
103
  values
47
104
  end
48
105
 
@@ -1,11 +1,9 @@
1
1
  module ActiveDataFrame
2
2
  class Table < DataFrameProxy
3
3
 
4
- def set(from, values)
4
+ def set(from, values, trim: false)
5
5
  ActiveDataFrame::Database.batch do
6
- data_frame_type.each do |instance|
7
- Row.new(self.block_type, self.data_frame_type, instance).set(from, values)
8
- end
6
+ Row.set_all(data_frame_type, self.block_type, self.data_frame_type, from, values, trim: trim)
9
7
  end
10
8
  end
11
9
 
@@ -1,3 +1,3 @@
1
1
  module ActiveDataFrame
2
- VERSION = "0.1.5"
2
+ VERSION = "0.1.6"
3
3
  end
@@ -87,7 +87,7 @@ RUBY
87
87
  t.integer :period_index
88
88
  #{
89
89
  columns.times.map do |i|
90
- " t.#{type} :t#{i+1}"
90
+ " t.#{type} :t#{i+1}, default: 0, allow_nil: false"
91
91
  end.join("\n")
92
92
  }
93
93
  RUBY
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: active_data_frame
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: 0.1.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Wouter Coppieters
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-06-19 00:00:00.000000000 Z
11
+ date: 2018-07-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler