active_data_frame 0.1.5 → 0.1.6

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6e6c248d13e0f7f10933eca32158e4fb33a080e3
4
- data.tar.gz: 28d52390deef35b8e582942612989f99e2026ed3
3
+ metadata.gz: 32b9b56e2515e43f2a30a10a35fd3c86d8de0bdf
4
+ data.tar.gz: 875f4e3f4cd9f19d4b0141b4c34a9420a36a06cd
5
5
  SHA512:
6
- metadata.gz: 9fd95c152778f43ea9d3d3e09160a22ed355989b5fdc5e2cbbfb1f10b2290aab4db07d04344acb595f30f28a98eaf1282b4a461611ba6c5999b5a060fc60ae77
7
- data.tar.gz: c0b32d8827258e8e8cf38e051758d8de8a1784bb2a4b19cdaada75af5b9b1541b3c45d15e6d2f033ca07ad1ec9422fc290c11b44e41fd2df5ef08b68bf805c47
6
+ metadata.gz: a4558730591012b79e19b19588a925e54b6678fdc279dc457bbad11c08ffdd3b2d4fd55c5d5668e90ed19c1ada58cd9ec73614fbfdbb72765f904ddd5c0509d1
7
+ data.tar.gz: f9b7852d03b01c23144c12780b87953e8d89fa039635e3bd2a7eb3029ad9ce0df6ee882c8bc1a2b902b7500a74d861aa8ace3720eb9d4b62b79abc5f40e8f9cc
@@ -8,7 +8,10 @@ require 'rmatrix'
8
8
 
9
9
  module ActiveDataFrame
10
10
  CONFIG = OpenStruct.new({
11
- suppress_logs: true
11
+ suppress_logs: false,
12
+ insert_max_batch_size: 10_000,
13
+ update_max_batch_size: 10_000,
14
+ delete_max_batch_size: 10_000,
12
15
  })
13
16
 
14
17
  module_function
@@ -16,7 +19,7 @@ module ActiveDataFrame
16
19
  yield CONFIG
17
20
  end
18
21
 
19
- def suppress_logs
20
- CONFIG.suppress_logs
22
+ CONFIG.each_pair do |(key)|
23
+ define_method(key){ CONFIG.send(key) }
21
24
  end
22
25
  end
@@ -31,12 +31,27 @@ module ActiveDataFrame
31
31
  def []=(from, values)
32
32
  values = Array(values).flatten.map(&@value_map.method(:[])) if @value_map
33
33
  from = column_map[from] if column_map && column_map[from]
34
- set(from, M[values, typecode: block_type::TYPECODE].to_a.flatten)
34
+ if values.kind_of?(Hash)
35
+ values = verify_and_cleanse_hash_values(values)
36
+ else
37
+ values = M[values, typecode: block_type::TYPECODE].to_a.flatten
38
+ end
39
+ set(from, values)
40
+ end
41
+
42
+ def verify_and_cleanse_hash_values(map)
43
+ length = nil
44
+ map.transform_values do |values|
45
+ cleansed = M[values, typecode: block_type::TYPECODE].to_a.flatten
46
+ raise "All streams provided via a hash must be of the same length" if length && length != cleansed.length
47
+ length ||= cleansed.length
48
+ cleansed
49
+ end
35
50
  end
36
51
 
37
52
  def clear(*ranges)
38
53
  extract_ranges(ranges).each do |r|
39
- set(r.first, M.blank(columns: r.last - r.first, typecode: block_type::TYPECODE))
54
+ set(r.first, M.blank(columns: r.last - r.first, typecode: block_type::TYPECODE), trim: true)
40
55
  end
41
56
  end
42
57
 
@@ -96,6 +111,10 @@ module ActiveDataFrame
96
111
  end
97
112
 
98
113
  def get_bounds(from, to, index=0)
114
+ self.class.get_bounds(from, to, block_type, index)
115
+ end
116
+
117
+ def self.get_bounds(from, to, block_type, index=0)
99
118
  from_block_index = from / block_type::BLOCK_SIZE
100
119
  from_block_offset = from % block_type::BLOCK_SIZE
101
120
  to_block_index = to / block_type::BLOCK_SIZE
@@ -117,6 +136,12 @@ module ActiveDataFrame
117
136
  end
118
137
 
119
138
  def iterate_bounds(all_bounds)
139
+ self.class.iterate_bounds(all_bounds, block_type) do |index, left, right, cursor, size|
140
+ yield index, left, right, cursor, size
141
+ end
142
+ end
143
+
144
+ def self.iterate_bounds(all_bounds, block_type)
120
145
  cursor = 0
121
146
  all_bounds.each do |bounds|
122
147
  index = bounds.from.index
@@ -58,93 +58,115 @@ module ActiveDataFrame
58
58
  self.batching = prev_batch
59
59
  flush! unless self.batching
60
60
  end
61
+
62
+ def bulk_upsert(updates, inserts)
63
+ Database.batch do
64
+ updates.group_by(&:keys).transform_values{|v| v.map(&:values) }.each do |columns, rows|
65
+ update = rows.map{|df_id, period_index, *values| [period_index, [values, df_id]] }
66
+ bulk_update(update, columns - [:data_frame_id, :period_index])
67
+ end
68
+ inserts.group_by(&:keys).transform_values{|v| v.map(&:values) }.each do |columns, rows|
69
+ insert = rows.map{|df_id, period_index, *values| [period_index, [values, df_id]] }
70
+ bulk_insert(insert, columns - [:data_frame_id, :period_index])
71
+ end
72
+ end
73
+ end
61
74
  ##
62
75
  # Update block data for all blocks in a single call
63
76
  ##
64
- def bulk_update(existing)
65
- case ActiveRecord::Base.connection_config[:adapter]
66
- when 'postgresql'.freeze
77
+ def bulk_update(existing, columns=block_type::COLUMNS)
78
+ existing.each_slice(ActiveDataFrame.update_max_batch_size) do |existing_slice|
79
+ # puts "Updating slice of #{existing_slice.length}"
80
+ case ActiveRecord::Base.connection_config[:adapter]
81
+ when 'postgresql'.freeze
82
+ #
83
+ # PostgreSQL supports the fast setting of multiple update values that differ
84
+ # per row from a temporary table.
85
+ #
86
+ updates = ''
87
+ existing_slice.each do |period_index, (values, df_id)|
88
+ updates << "(#{df_id}, #{period_index}, #{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}),"
89
+ end
90
+ Database.execute(
91
+ <<-SQL
92
+ UPDATE #{block_type.table_name}
93
+ SET #{columns.map{|col| "#{col} = t.#{col}" }.join(", ")}
94
+ FROM(
95
+ VALUES #{updates[0..-2]}) as t(data_frame_id, period_index, #{columns.join(',')})
96
+ WHERE #{block_type.table_name}.data_frame_id = t.data_frame_id
97
+ AND #{block_type.table_name}.period_index = t.period_index
98
+ AND #{block_type.table_name}.data_frame_type = '#{data_frame_type.name}'
99
+ SQL
100
+ )
67
101
  #
68
- # PostgreSQL Supports the fast setting of multiple update values that differ
69
- # per row from a temporary table.
102
+ # For MySQL we use the ON DUPLICATE KEY UPDATE functionality.
103
+ # This relies on there being a unique index on data frame and period index
104
+ # on the blocks table.
105
+ # This tends to be faster than the general CASE based solution below
106
+ # but slower than the PostgreSQL solution above
70
107
  #
71
- updates = ''
72
- existing.each do |period_index, (values, df_id)|
73
- updates << "(#{df_id}, #{period_index}, #{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}),"
74
- end
75
- Database.execute(
76
- <<-SQL
77
- UPDATE #{block_type.table_name}
78
- SET #{block_type::COLUMNS.map{|col| "#{col} = t.#{col}" }.join(", ")}
79
- FROM(
80
- VALUES #{updates[0..-2]}) as t(data_frame_id, period_index, #{block_type::COLUMNS.join(',')})
81
- WHERE #{block_type.table_name}.data_frame_id = t.data_frame_id
82
- AND #{block_type.table_name}.period_index = t.period_index
108
+ when 'mysql2'.freeze
109
+ # Fast bulk update
110
+ updates, on_duplicate = "", ""
111
+ existing_slice.each do |period_index, (values, df_id)|
112
+ updates << "(#{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}, #{df_id}, #{period_index}, '#{data_frame_type.name}'),"
113
+ end
114
+ on_duplicate = columns.map do |cname|
115
+ "#{cname}=VALUES(#{cname})"
116
+ end.join(", ")
117
+ stmt = <<-SQL
118
+ INSERT INTO #{block_type.table_name} (#{columns.join(',')},data_frame_id,period_index,data_frame_type)
119
+ VALUES #{updates[0..-2]}
120
+ ON DUPLICATE KEY UPDATE #{on_duplicate}
121
+ SQL
122
+ Database.execute(stmt)
123
+ else
124
+ #
125
+ # General CASE based solution for multiple differing updates
126
+ # set per row.
127
+ # We use a CASE statement per column which determines the column
128
+ # to set based on the period index
129
+ #
130
+ ids = existing_slice.map {|_, (_, id)| id}
131
+ updates = columns.map.with_index do |column, column_idx|
132
+ [column, "CASE period_index\n#{existing_slice.map{|period_index, (values, _)| "WHEN #{period_index} then #{values[column_idx]}"}.join("\n")} \nEND\n"]
133
+ end.to_h
134
+ update_statement = updates.map{|cl, up| "#{cl} = #{up}" }.join(', ')
135
+ Database.execute(<<-SQL
136
+ UPDATE #{block_type.table_name} SET #{update_statement} WHERE
137
+ #{block_type.table_name}.data_frame_id IN (#{ids.join(',')})
83
138
  AND #{block_type.table_name}.data_frame_type = '#{data_frame_type.name}'
139
+ AND #{block_type.table_name}.period_index IN (#{existing_slice.map(&:first).join(', ')});
84
140
  SQL
85
- )
86
- #
87
- # For MySQL we use the ON DUPLICATE KEY UPDATE functionality.
88
- # This relies on there being a unique index dataframe and period index
89
- # on the blocks table.
90
- # This tends to be faster than the general CASE based solution below
91
- # but slower than the PostgreSQL solution above
92
- #
93
- when 'mysql2'.freeze
94
- # Fast bulk update
95
- updates, on_duplicate = "", ""
96
- existing.each do |period_index, (values, df_id)|
97
- updates << "(#{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}, #{df_id}, #{period_index}, '#{data_frame_type.name}'),"
141
+ )
98
142
  end
99
- on_duplicate = block_type::COLUMNS.map do |cname|
100
- "#{cname}=VALUES(#{cname})"
101
- end.join(", ")
102
- stmt = <<-SQL
103
- INSERT INTO #{block_type.table_name} (#{block_type::COLUMNS.join(',')},data_frame_id,period_index,data_frame_type)
104
- VALUES #{updates[0..-2]}
105
- ON DUPLICATE KEY UPDATE #{on_duplicate}
106
- SQL
107
- Database.execute(stmt)
108
- else
109
- #
110
- # General CASE based solution for multiple differing updates
111
- # set per row.
112
- # We use a CASE statement per column which determines the column
113
- # to set based on the period index
114
- #
115
- ids = existing.map {|_, (_, id)| id}
116
- updates = block_type::COLUMNS.map.with_index do |column, column_idx|
117
- [column, "CASE period_index\n#{existing.map{|period_index, (values, _)| "WHEN #{period_index} then #{values[column_idx]}"}.join("\n")} \nEND\n"]
118
- end.to_h
119
- update_statement = updates.map{|cl, up| "#{cl} = #{up}" }.join(', ')
120
- Database.execute(<<-SQL
121
- UPDATE #{block_type.table_name} SET #{update_statement} WHERE
122
- #{block_type.table_name}.data_frame_id IN (#{ids.join(',')})
123
- AND #{block_type.table_name}.data_frame_type = '#{data_frame_type.name}'
124
- AND #{block_type.table_name}.period_index IN (#{existing.keys.join(', ')});
125
- SQL
126
- )
127
143
  end
128
144
  end
129
145
 
130
146
  def bulk_delete(id, indices)
131
- block_type.where(data_frame_id: id, period_index: indices).delete_all
147
+ indices.each_slice(ActiveDataFrame.delete_max_batch_size) do |slice|
148
+ # puts "Deleting slice of #{slice.length}"
149
+ block_type.where(data_frame_id: id, period_index: slice).delete_all
150
+ end
132
151
  end
133
152
 
134
153
  ##
135
154
  # Insert block data for all blocks in a single call
136
155
  ##
137
- def bulk_insert(new_blocks, instance)
138
- inserts = ''
139
- new_blocks.each do |period_index, (values)|
140
- inserts << \
141
- case ActiveRecord::Base.connection_config[:adapter]
142
- when 'postgresql', 'mysql2' then "(#{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}, #{instance.id}, #{period_index}, '#{data_frame_type.name}'),"
143
- else "(#{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}, #{instance.id}, #{period_index}, '#{data_frame_type.name}'),"
156
+ def bulk_insert(new_blocks, columns=block_type::COLUMNS)
157
+ new_blocks.each_slice(ActiveDataFrame.insert_max_batch_size) do |new_blocks_slice|
158
+ # puts "Inserting slice of #{new_blocks_slice.length}"
159
+ inserts = ''
160
+ new_blocks_slice.each do |period_index, (values, df_id)|
161
+ inserts << \
162
+ case ActiveRecord::Base.connection_config[:adapter]
163
+ when 'postgresql', 'mysql2' then "(#{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}, #{df_id}, #{period_index}, '#{data_frame_type.name}'),"
164
+ else "(#{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}, #{df_id}, #{period_index}, '#{data_frame_type.name}'),"
165
+ end
144
166
  end
167
+ sql = "INSERT INTO #{block_type.table_name} (#{columns.join(',')}, data_frame_id, period_index, data_frame_type) VALUES #{inserts[0..-2]}"
168
+ Database.execute sql
145
169
  end
146
- sql = "INSERT INTO #{block_type.table_name} (#{block_type::COLUMNS.join(',')}, data_frame_id, period_index, data_frame_type) VALUES #{inserts[0..-2]}"
147
- Database.execute sql
148
170
  end
149
171
  end
150
172
  end
@@ -12,15 +12,72 @@ module ActiveDataFrame
12
12
  "#{data_frame_type.name} Row(#{instance.id})"
13
13
  end
14
14
 
15
- def set(from, values)
15
+ def self.set_all(scope, block_type, data_frame_type, from, values, trim: false)
16
+ if trim || ActiveRecord::Base.connection_config[:adapter] === 'mysql2'
17
+ scope.each do |instance|
18
+ Row.new(block_type, data_frame_type, instance).patch(from, values.kind_of?(Hash) ? values[instance.id] : values)
19
+ end
20
+ else
21
+ upsert_all(scope, block_type, data_frame_type, from, values)
22
+ end
23
+ end
24
+
25
+ def self.upsert_all(rows, block_type, data_frame_type, from, values)
26
+ length = values.kind_of?(Hash) ? values.values.first.length : values.length
27
+ to = from + length - 1
28
+ bounds = get_bounds(from, to, block_type)
29
+ scope = block_type.where(data_frame_type: data_frame_type.name, data_frame_id: rows.select(:id))
30
+ scope = scope.where(data_frame_id: values.keys) if values.kind_of?(Hash)
31
+ all_update_indices = scope.where(period_index: bounds.from.index..bounds.to.index).order(data_frame_id: :asc, period_index: :asc).pluck(:data_frame_id, :period_index)
32
+ grouped_update_indices = all_update_indices.group_by(&:first).transform_values{|value| Set.new(value.map!(&:last)) }
33
+ instance_ids = rows.pluck(:id)
34
+ instance_ids &= values.keys if values.kind_of?(Hash)
35
+ upserts = to_enum(:iterate_bounds, [bounds], block_type).flat_map do |index, left, right, cursor, size|
36
+ instance_ids.map do |instance_id|
37
+ slice = values.kind_of?(Hash) ? values[instance_id][cursor...cursor + size] : values[cursor...cursor + size]
38
+ [[:data_frame_id, instance_id], [:period_index, index], *(left.succ..right.succ).map{|v| :"t#{v}" }.zip(slice)].to_h
39
+ end
40
+ end
41
+
42
+ update, insert = upserts.partition{|upsert| grouped_update_indices[upsert[:data_frame_id]]&.include?(upsert[:period_index]) }
43
+ Database.for_types(block: block_type, df: data_frame_type).bulk_upsert(update, insert)
44
+ values
45
+ end
46
+
47
+ def set(from, values, trim: false)
48
+ if trim || ActiveRecord::Base.connection_config[:adapter] === 'mysql2'
49
+ patch(from, values)
50
+ else
51
+ upsert(from, values)
52
+ end
53
+ end
54
+
55
+ def upsert(from, values)
56
+ to = (from + values.length) - 1
57
+ bounds = get_bounds(from, to)
58
+ update_indices = Set.new(scope.where(period_index: bounds.from.index..bounds.to.index).order(period_index: :asc).pluck(:period_index))
59
+ # Detect blocks in bounds:
60
+ # - If existing and covered, do an update without load
61
+ # - If existing and uncovered, do a small write (without load)
62
+ # - If not existing, insert!
63
+ upserts = to_enum(:iterate_bounds, [bounds]).map do |index, left, right, cursor, size|
64
+ [[:data_frame_id, self.instance.id], [:period_index, index], *(left.succ..right.succ).map{|v| :"t#{v}" }.zip(values[cursor...cursor + size])].to_h
65
+ end
66
+ update, insert = upserts.partition{|upsert| update_indices.include?(upsert[:period_index]) }
67
+ database.bulk_upsert(update, insert)
68
+ values
69
+ end
70
+
71
+ def patch(from, values)
16
72
  to = (from + values.length) - 1
17
73
  bounds = get_bounds(from, to)
18
74
 
19
75
  new_blocks = Hash.new do |h, k|
20
- h[k] = [[0] * block_type::BLOCK_SIZE]
76
+ h[k] = [[0] * block_type::BLOCK_SIZE, self.instance.id]
21
77
  end
22
78
 
23
79
  deleted_indices = []
80
+
24
81
  existing = blocks_between([bounds]).pluck(:data_frame_id, :period_index, *block_type::COLUMNS).map do |id, period_index, *block_values|
25
82
  [period_index, [block_values, id]]
26
83
  end.to_h
@@ -41,8 +98,8 @@ module ActiveDataFrame
41
98
 
42
99
 
43
100
  database.bulk_delete(self.instance.id, deleted_indices) unless deleted_indices.size.zero?
44
- database.bulk_update(existing) unless existing.size.zero?
45
- database.bulk_insert(new_blocks, instance) unless new_blocks.size.zero?
101
+ database.bulk_update(existing) unless existing.size.zero?
102
+ database.bulk_insert(new_blocks) unless new_blocks.size.zero?
46
103
  values
47
104
  end
48
105
 
@@ -1,11 +1,9 @@
1
1
  module ActiveDataFrame
2
2
  class Table < DataFrameProxy
3
3
 
4
- def set(from, values)
4
+ def set(from, values, trim: false)
5
5
  ActiveDataFrame::Database.batch do
6
- data_frame_type.each do |instance|
7
- Row.new(self.block_type, self.data_frame_type, instance).set(from, values)
8
- end
6
+ Row.set_all(data_frame_type, self.block_type, self.data_frame_type, from, values, trim: trim)
9
7
  end
10
8
  end
11
9
 
@@ -1,3 +1,3 @@
1
1
  module ActiveDataFrame
2
- VERSION = "0.1.5"
2
+ VERSION = "0.1.6"
3
3
  end
@@ -87,7 +87,7 @@ RUBY
87
87
  t.integer :period_index
88
88
  #{
89
89
  columns.times.map do |i|
90
- " t.#{type} :t#{i+1}"
90
+ " t.#{type} :t#{i+1}, default: 0, allow_nil: false"
91
91
  end.join("\n")
92
92
  }
93
93
  RUBY
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: active_data_frame
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: 0.1.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Wouter Coppieters
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-06-19 00:00:00.000000000 Z
11
+ date: 2018-07-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler