RubyGems - active_data_frame - Versions diffs - 0.1.5 → 0.1.6 - Mend

active_data_frame 0.1.5 → 0.1.6

This diff shows the differences in content between two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Files changed (9) hide show

checksums.yaml +4 -4
data/lib/active_data_frame.rb +6 -3
data/lib/active_data_frame/data_frame_proxy.rb +27 -2
data/lib/active_data_frame/database.rb +90 -68
data/lib/active_data_frame/row.rb +61 -4
data/lib/active_data_frame/table.rb +2 -4
data/lib/active_data_frame/version.rb +1 -1
data/lib/generators/active_data_frame/install_generator.rb +1 -1
metadata +2 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 6e6c248d13e0f7f10933eca32158e4fb33a080e3
-  data.tar.gz: 28d52390deef35b8e582942612989f99e2026ed3
+  metadata.gz: 32b9b56e2515e43f2a30a10a35fd3c86d8de0bdf
+  data.tar.gz: 875f4e3f4cd9f19d4b0141b4c34a9420a36a06cd
 SHA512:
-  metadata.gz: 9fd95c152778f43ea9d3d3e09160a22ed355989b5fdc5e2cbbfb1f10b2290aab4db07d04344acb595f30f28a98eaf1282b4a461611ba6c5999b5a060fc60ae77
-  data.tar.gz: c0b32d8827258e8e8cf38e051758d8de8a1784bb2a4b19cdaada75af5b9b1541b3c45d15e6d2f033ca07ad1ec9422fc290c11b44e41fd2df5ef08b68bf805c47
+  metadata.gz: a4558730591012b79e19b19588a925e54b6678fdc279dc457bbad11c08ffdd3b2d4fd55c5d5668e90ed19c1ada58cd9ec73614fbfdbb72765f904ddd5c0509d1
+  data.tar.gz: f9b7852d03b01c23144c12780b87953e8d89fa039635e3bd2a7eb3029ad9ce0df6ee882c8bc1a2b902b7500a74d861aa8ace3720eb9d4b62b79abc5f40e8f9cc

data/lib/active_data_frame.rb CHANGED

@@ -8,7 +8,10 @@ require 'rmatrix'
 module ActiveDataFrame
   CONFIG = OpenStruct.new({
-    suppress_logs: true
+    suppress_logs: false,
+    insert_max_batch_size: 10_000,
+    update_max_batch_size: 10_000,
+    delete_max_batch_size: 10_000,
   })
   module_function
@@ -16,7 +19,7 @@ module ActiveDataFrame
       yield CONFIG
     end
-    def suppress_logs
-      CONFIG.suppress_logs
+    CONFIG.each_pair do |(key)|
+      define_method(key){ CONFIG.send(key) }
     end
 end

data/lib/active_data_frame/data_frame_proxy.rb CHANGED

@@ -31,12 +31,27 @@ module ActiveDataFrame
     def []=(from, values)
       values = Array(values).flatten.map(&@value_map.method(:[])) if @value_map
       from = column_map[from] if column_map && column_map[from]
-      set(from, M[values, typecode: block_type::TYPECODE].to_a.flatten)
+      if values.kind_of?(Hash)
+        values = verify_and_cleanse_hash_values(values)
+      else
+        values = M[values, typecode: block_type::TYPECODE].to_a.flatten
+      end
+      set(from, values)
+    end
+    def verify_and_cleanse_hash_values(map)
+      length = nil
+      map.transform_values do |values|
+        cleansed = M[values, typecode: block_type::TYPECODE].to_a.flatten
+        raise "All streams provided via a hash must be of the same length" if length && length != cleansed.length
+        length ||= cleansed.length
+        cleansed
+      end
     end
     def clear(*ranges)
       extract_ranges(ranges).each do |r|
-        set(r.first, M.blank(columns: r.last - r.first, typecode: block_type::TYPECODE))
+        set(r.first, M.blank(columns: r.last - r.first, typecode: block_type::TYPECODE), trim: true)
       end
     end
@@ -96,6 +111,10 @@ module ActiveDataFrame
     end
     def get_bounds(from, to, index=0)
+      self.class.get_bounds(from, to, block_type, index)
+    end
+    def self.get_bounds(from, to, block_type, index=0)
       from_block_index  = from / block_type::BLOCK_SIZE
       from_block_offset = from % block_type::BLOCK_SIZE
       to_block_index    = to / block_type::BLOCK_SIZE
@@ -117,6 +136,12 @@ module ActiveDataFrame
     end
     def iterate_bounds(all_bounds)
+      self.class.iterate_bounds(all_bounds, block_type) do |index, left, right, cursor, size|
+        yield index, left, right, cursor, size
+      end
+    end
+    def self.iterate_bounds(all_bounds, block_type)
       cursor = 0
       all_bounds.each do |bounds|
         index = bounds.from.index

data/lib/active_data_frame/database.rb CHANGED

@@ -58,93 +58,115 @@ module ActiveDataFrame
       self.batching = prev_batch
       flush! unless self.batching
     end
+    def bulk_upsert(updates, inserts)
+      Database.batch do
+        updates.group_by(&:keys).transform_values{|v| v.map(&:values) }.each do |columns, rows|
+          update = rows.map{|df_id, period_index, *values| [period_index, [values, df_id]] }
+          bulk_update(update, columns - [:data_frame_id, :period_index])
+        end
+        inserts.group_by(&:keys).transform_values{|v| v.map(&:values) }.each do |columns, rows|
+          insert = rows.map{|df_id, period_index, *values| [period_index, [values, df_id]] }
+          bulk_insert(insert, columns - [:data_frame_id, :period_index])
+        end
+      end
+    end
     ##
     # Update block data for all blocks in a single call
     ##
-    def bulk_update(existing)
-      case ActiveRecord::Base.connection_config[:adapter]
-      when 'postgresql'.freeze
+    def bulk_update(existing, columns=block_type::COLUMNS)
+      existing.each_slice(ActiveDataFrame.update_max_batch_size) do |existing_slice|
+        # puts "Updating slice of #{existing_slice.length}"
+        case ActiveRecord::Base.connection_config[:adapter]
+        when 'postgresql'.freeze
+          #
+          # PostgreSQL Supports the fast setting of multiple update values that differ
+          # per row from a temporary table.
+          #
+          updates = ''
+          existing_slice.each do |period_index, (values, df_id)|
+            updates <<  "(#{df_id}, #{period_index}, #{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}),"
+          end
+          Database.execute(
+            <<-SQL
+            UPDATE #{block_type.table_name}
+              SET #{columns.map{|col| "#{col} = t.#{col}" }.join(", ")}
+              FROM(
+              VALUES #{updates[0..-2]}) as t(data_frame_id, period_index, #{columns.join(',')})
+              WHERE #{block_type.table_name}.data_frame_id = t.data_frame_id
+              AND #{block_type.table_name}.period_index = t.period_index
+              AND #{block_type.table_name}.data_frame_type = '#{data_frame_type.name}'
+            SQL
+          )
         #
-        # PostgreSQL Supports the fast setting of multiple update values that differ
-        # per row from a temporary table.
+        # For MySQL we use the ON DUPLICATE KEY UPDATE functionality.
+        # This relies on there being a unique index dataframe and period index
+        # on the blocks table.
+        # This tends to be faster than the general CASE based solution below
+        # but slower than the PostgreSQL solution above
         #
-        updates = ''
-        existing.each do |period_index, (values, df_id)|
-          updates <<  "(#{df_id}, #{period_index}, #{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}),"
-        end
-        Database.execute(
-          <<-SQL
-          UPDATE #{block_type.table_name}
-            SET #{block_type::COLUMNS.map{|col| "#{col} = t.#{col}" }.join(", ")}
-            FROM(
-            VALUES #{updates[0..-2]}) as t(data_frame_id, period_index, #{block_type::COLUMNS.join(',')})
-            WHERE #{block_type.table_name}.data_frame_id = t.data_frame_id
-            AND #{block_type.table_name}.period_index = t.period_index
+        when 'mysql2'.freeze
+          # Fast bulk update
+          updates, on_duplicate = "", ""
+          existing_slice.each do |period_index, (values, df_id)|
+            updates << "(#{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}, #{df_id}, #{period_index}, '#{data_frame_type.name}'),"
+          end
+          on_duplicate = columns.map do |cname|
+            "#{cname}=VALUES(#{cname})"
+          end.join(", ")
+          stmt = <<-SQL
+            INSERT INTO #{block_type.table_name} (#{columns.join(',')},data_frame_id,period_index,data_frame_type)
+            VALUES #{updates[0..-2]}
+            ON DUPLICATE KEY UPDATE #{on_duplicate}
+          SQL
+          Database.execute(stmt)
+        else
+          #
+          # General CASE based solution for multiple differing updates
+          # set per row.
+          # We use a CASE statement per column which determines the column
+          # to set based on the period index
+          #
+          ids = existing_slice.map {|_, (_, id)| id}
+          updates = columns.map.with_index do |column, column_idx|
+            [column, "CASE period_index\n#{existing_slice.map{|period_index, (values, _)| "WHEN #{period_index} then #{values[column_idx]}"}.join("\n")} \nEND\n"]
+          end.to_h
+          update_statement = updates.map{|cl, up| "#{cl} = #{up}" }.join(', ')
+          Database.execute(<<-SQL
+            UPDATE #{block_type.table_name} SET #{update_statement} WHERE
+            #{block_type.table_name}.data_frame_id IN (#{ids.join(',')})
             AND #{block_type.table_name}.data_frame_type = '#{data_frame_type.name}'
+            AND #{block_type.table_name}.period_index IN (#{existing_slice.map(&:first).join(', ')});
           SQL
-        )
-      #
-      # For MySQL we use the ON DUPLICATE KEY UPDATE functionality.
-      # This relies on there being a unique index dataframe and period index
-      # on the blocks table.
-      # This tends to be faster than the general CASE based solution below
-      # but slower than the PostgreSQL solution above
-      #
-      when 'mysql2'.freeze
-        # Fast bulk update
-        updates, on_duplicate = "", ""
-        existing.each do |period_index, (values, df_id)|
-          updates << "(#{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}, #{df_id}, #{period_index}, '#{data_frame_type.name}'),"
+          )
         end
-        on_duplicate = block_type::COLUMNS.map do |cname|
-          "#{cname}=VALUES(#{cname})"
-        end.join(", ")
-        stmt = <<-SQL
-          INSERT INTO #{block_type.table_name} (#{block_type::COLUMNS.join(',')},data_frame_id,period_index,data_frame_type)
-          VALUES #{updates[0..-2]}
-          ON DUPLICATE KEY UPDATE #{on_duplicate}
-        SQL
-        Database.execute(stmt)
-      else
-        #
-        # General CASE based solution for multiple differing updates
-        # set per row.
-        # We use a CASE statement per column which determines the column
-        # to set based on the period index
-        #
-        ids = existing.map {|_, (_, id)| id}
-        updates = block_type::COLUMNS.map.with_index do |column, column_idx|
-          [column, "CASE period_index\n#{existing.map{|period_index, (values, _)| "WHEN #{period_index} then #{values[column_idx]}"}.join("\n")} \nEND\n"]
-        end.to_h
-        update_statement = updates.map{|cl, up| "#{cl} = #{up}" }.join(', ')
-        Database.execute(<<-SQL
-          UPDATE #{block_type.table_name} SET #{update_statement} WHERE
-          #{block_type.table_name}.data_frame_id IN (#{ids.join(',')})
-          AND #{block_type.table_name}.data_frame_type = '#{data_frame_type.name}'
-          AND #{block_type.table_name}.period_index IN (#{existing.keys.join(', ')});
-        SQL
-        )
       end
     end
     def bulk_delete(id, indices)
-      block_type.where(data_frame_id: id, period_index: indices).delete_all
+      indices.each_slice(ActiveDataFrame.delete_max_batch_size) do |slice|
+        # puts "Deleting slice of #{slice.length}"
+        block_type.where(data_frame_id: id, period_index: slice).delete_all
+      end
     end
     ##
     # Insert block data for all blocks in a single call
     ##
-    def bulk_insert(new_blocks, instance)
-      inserts = ''
-      new_blocks.each do |period_index, (values)|
-        inserts << \
-        case ActiveRecord::Base.connection_config[:adapter]
-        when 'postgresql', 'mysql2' then "(#{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}, #{instance.id}, #{period_index}, '#{data_frame_type.name}'),"
-        else "(#{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}, #{instance.id}, #{period_index}, '#{data_frame_type.name}'),"
+    def bulk_insert(new_blocks, columns=block_type::COLUMNS)
+      new_blocks.each_slice(ActiveDataFrame.insert_max_batch_size) do |new_blocks_slice|
+        # puts "Inserting slice of #{new_blocks_slice.length}"
+        inserts = ''
+        new_blocks_slice.each do |period_index, (values, df_id)|
+          inserts << \
+          case ActiveRecord::Base.connection_config[:adapter]
+          when 'postgresql', 'mysql2' then "(#{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}, #{df_id}, #{period_index}, '#{data_frame_type.name}'),"
+          else "(#{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}, #{df_id}, #{period_index}, '#{data_frame_type.name}'),"
+          end
         end
+        sql = "INSERT INTO #{block_type.table_name} (#{columns.join(',')}, data_frame_id, period_index, data_frame_type) VALUES #{inserts[0..-2]}"
+        Database.execute sql
       end
-      sql = "INSERT INTO #{block_type.table_name} (#{block_type::COLUMNS.join(',')}, data_frame_id, period_index, data_frame_type) VALUES #{inserts[0..-2]}"
-      Database.execute sql
     end
   end
 end

data/lib/active_data_frame/row.rb CHANGED

@@ -12,15 +12,72 @@ module ActiveDataFrame
       "#{data_frame_type.name} Row(#{instance.id})"
     end
-    def set(from, values)
+    def self.set_all(scope, block_type, data_frame_type, from, values, trim: false)
+      if trim || ActiveRecord::Base.connection_config[:adapter] === 'mysql2'
+        scope.each do |instance|
+          Row.new(block_type, data_frame_type, instance).patch(from, values.kind_of?(Hash) ? values[instance.id] : values)
+        end
+      else
+        upsert_all(scope, block_type, data_frame_type, from, values)
+      end
+    end
+    def self.upsert_all(rows, block_type, data_frame_type, from, values)
+      length                 = values.kind_of?(Hash) ? values.values.first.length : values.length
+      to                     = from + length - 1
+      bounds                 = get_bounds(from, to, block_type)
+      scope                  = block_type.where(data_frame_type: data_frame_type.name, data_frame_id: rows.select(:id))
+      scope                  = scope.where(data_frame_id: values.keys) if values.kind_of?(Hash)
+      all_update_indices     = scope.where(period_index: bounds.from.index..bounds.to.index).order(data_frame_id: :asc, period_index: :asc).pluck(:data_frame_id, :period_index)
+      grouped_update_indices = all_update_indices.group_by(&:first).transform_values{|value| Set.new(value.map!(&:last)) }
+      instance_ids           = rows.pluck(:id)
+      instance_ids           &= values.keys if values.kind_of?(Hash)
+      upserts = to_enum(:iterate_bounds, [bounds], block_type).flat_map do |index, left, right, cursor, size|
+        instance_ids.map do |instance_id|
+          slice = values.kind_of?(Hash) ? values[instance_id][cursor...cursor + size] : values[cursor...cursor + size]
+          [[:data_frame_id, instance_id], [:period_index, index], *(left.succ..right.succ).map{|v| :"t#{v}" }.zip(slice)].to_h
+        end
+      end
+      update, insert = upserts.partition{|upsert| grouped_update_indices[upsert[:data_frame_id]]&.include?(upsert[:period_index]) }
+      Database.for_types(block: block_type, df: data_frame_type).bulk_upsert(update, insert)
+      values
+    end
+    def set(from, values, trim: false)
+      if trim || ActiveRecord::Base.connection_config[:adapter] === 'mysql2'
+        patch(from, values)
+      else
+        upsert(from, values)
+      end
+    end
+    def upsert(from, values)
+      to             = (from + values.length) - 1
+      bounds         = get_bounds(from, to)
+      update_indices = Set.new(scope.where(period_index: bounds.from.index..bounds.to.index).order(period_index: :asc).pluck(:period_index))
+      # Detect blocks in bounds:
+      # - If existing and covered, do an update without load
+      # - If existing and uncovered, do a small write (without load)
+      # - If not existing, insert!
+      upserts = to_enum(:iterate_bounds, [bounds]).map do |index, left, right, cursor, size|
+        [[:data_frame_id, self.instance.id], [:period_index, index], *(left.succ..right.succ).map{|v| :"t#{v}" }.zip(values[cursor...cursor + size])].to_h
+      end
+      update, insert = upserts.partition{|upsert| update_indices.include?(upsert[:period_index]) }
+      database.bulk_upsert(update, insert)
+      values
+    end
+    def patch(from, values)
       to     = (from + values.length) - 1
       bounds = get_bounds(from, to)
       new_blocks = Hash.new do |h, k|
-        h[k] = [[0] * block_type::BLOCK_SIZE]
+        h[k] = [[0] * block_type::BLOCK_SIZE, self.instance.id]
       end
       deleted_indices = []
       existing = blocks_between([bounds]).pluck(:data_frame_id, :period_index, *block_type::COLUMNS).map do |id, period_index, *block_values|
         [period_index, [block_values, id]]
       end.to_h
@@ -41,8 +98,8 @@ module ActiveDataFrame
       database.bulk_delete(self.instance.id, deleted_indices) unless deleted_indices.size.zero?
-      database.bulk_update(existing)                 unless existing.size.zero?
-      database.bulk_insert(new_blocks, instance)     unless new_blocks.size.zero?
+      database.bulk_update(existing)       unless existing.size.zero?
+      database.bulk_insert(new_blocks)     unless new_blocks.size.zero?
       values
     end

data/lib/active_data_frame/table.rb CHANGED

@@ -1,11 +1,9 @@
 module ActiveDataFrame
   class Table < DataFrameProxy
-    def set(from, values)
+    def set(from, values, trim: false)
       ActiveDataFrame::Database.batch do
-        data_frame_type.each do |instance|
-          Row.new(self.block_type, self.data_frame_type, instance).set(from, values)
-        end
+        Row.set_all(data_frame_type, self.block_type, self.data_frame_type, from, values, trim: trim)
       end
     end

data/lib/active_data_frame/version.rb CHANGED

@@ -1,3 +1,3 @@
 module ActiveDataFrame
-  VERSION = "0.1.5"
+  VERSION = "0.1.6"
 end

data/lib/generators/active_data_frame/install_generator.rb CHANGED

@@ -87,7 +87,7 @@ RUBY
       t.integer :period_index
 #{
     columns.times.map do |i|
-"      t.#{type} :t#{i+1}"
+"      t.#{type} :t#{i+1}, default: 0, allow_nil: false"
     end.join("\n")
     }
 RUBY

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: active_data_frame
 version: !ruby/object:Gem::Version
-  version: 0.1.5
+  version: 0.1.6
 platform: ruby
 authors:
 - Wouter Coppieters
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-06-19 00:00:00.000000000 Z
+date: 2018-07-19 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler