active_data_frame 0.1.5 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6e6c248d13e0f7f10933eca32158e4fb33a080e3
4
- data.tar.gz: 28d52390deef35b8e582942612989f99e2026ed3
3
+ metadata.gz: 32b9b56e2515e43f2a30a10a35fd3c86d8de0bdf
4
+ data.tar.gz: 875f4e3f4cd9f19d4b0141b4c34a9420a36a06cd
5
5
  SHA512:
6
- metadata.gz: 9fd95c152778f43ea9d3d3e09160a22ed355989b5fdc5e2cbbfb1f10b2290aab4db07d04344acb595f30f28a98eaf1282b4a461611ba6c5999b5a060fc60ae77
7
- data.tar.gz: c0b32d8827258e8e8cf38e051758d8de8a1784bb2a4b19cdaada75af5b9b1541b3c45d15e6d2f033ca07ad1ec9422fc290c11b44e41fd2df5ef08b68bf805c47
6
+ metadata.gz: a4558730591012b79e19b19588a925e54b6678fdc279dc457bbad11c08ffdd3b2d4fd55c5d5668e90ed19c1ada58cd9ec73614fbfdbb72765f904ddd5c0509d1
7
+ data.tar.gz: f9b7852d03b01c23144c12780b87953e8d89fa039635e3bd2a7eb3029ad9ce0df6ee882c8bc1a2b902b7500a74d861aa8ace3720eb9d4b62b79abc5f40e8f9cc
@@ -8,7 +8,10 @@ require 'rmatrix'
8
8
 
9
9
  module ActiveDataFrame
10
10
  CONFIG = OpenStruct.new({
11
- suppress_logs: true
11
+ suppress_logs: false,
12
+ insert_max_batch_size: 10_000,
13
+ update_max_batch_size: 10_000,
14
+ delete_max_batch_size: 10_000,
12
15
  })
13
16
 
14
17
  module_function
@@ -16,7 +19,7 @@ module ActiveDataFrame
16
19
  yield CONFIG
17
20
  end
18
21
 
19
- def suppress_logs
20
- CONFIG.suppress_logs
22
+ CONFIG.each_pair do |(key)|
23
+ define_method(key){ CONFIG.send(key) }
21
24
  end
22
25
  end
@@ -31,12 +31,27 @@ module ActiveDataFrame
31
31
  def []=(from, values)
32
32
  values = Array(values).flatten.map(&@value_map.method(:[])) if @value_map
33
33
  from = column_map[from] if column_map && column_map[from]
34
- set(from, M[values, typecode: block_type::TYPECODE].to_a.flatten)
34
+ if values.kind_of?(Hash)
35
+ values = verify_and_cleanse_hash_values(values)
36
+ else
37
+ values = M[values, typecode: block_type::TYPECODE].to_a.flatten
38
+ end
39
+ set(from, values)
40
+ end
41
+
42
+ def verify_and_cleanse_hash_values(map)
43
+ length = nil
44
+ map.transform_values do |values|
45
+ cleansed = M[values, typecode: block_type::TYPECODE].to_a.flatten
46
+ raise "All streams provided via a hash must be of the same length" if length && length != cleansed.length
47
+ length ||= cleansed.length
48
+ cleansed
49
+ end
35
50
  end
36
51
 
37
52
  def clear(*ranges)
38
53
  extract_ranges(ranges).each do |r|
39
- set(r.first, M.blank(columns: r.last - r.first, typecode: block_type::TYPECODE))
54
+ set(r.first, M.blank(columns: r.last - r.first, typecode: block_type::TYPECODE), trim: true)
40
55
  end
41
56
  end
42
57
 
@@ -96,6 +111,10 @@ module ActiveDataFrame
96
111
  end
97
112
 
98
113
  def get_bounds(from, to, index=0)
114
+ self.class.get_bounds(from, to, block_type, index)
115
+ end
116
+
117
+ def self.get_bounds(from, to, block_type, index=0)
99
118
  from_block_index = from / block_type::BLOCK_SIZE
100
119
  from_block_offset = from % block_type::BLOCK_SIZE
101
120
  to_block_index = to / block_type::BLOCK_SIZE
@@ -117,6 +136,12 @@ module ActiveDataFrame
117
136
  end
118
137
 
119
138
  def iterate_bounds(all_bounds)
139
+ self.class.iterate_bounds(all_bounds, block_type) do |index, left, right, cursor, size|
140
+ yield index, left, right, cursor, size
141
+ end
142
+ end
143
+
144
+ def self.iterate_bounds(all_bounds, block_type)
120
145
  cursor = 0
121
146
  all_bounds.each do |bounds|
122
147
  index = bounds.from.index
@@ -58,93 +58,115 @@ module ActiveDataFrame
58
58
  self.batching = prev_batch
59
59
  flush! unless self.batching
60
60
  end
61
+
62
+ def bulk_upsert(updates, inserts)
63
+ Database.batch do
64
+ updates.group_by(&:keys).transform_values{|v| v.map(&:values) }.each do |columns, rows|
65
+ update = rows.map{|df_id, period_index, *values| [period_index, [values, df_id]] }
66
+ bulk_update(update, columns - [:data_frame_id, :period_index])
67
+ end
68
+ inserts.group_by(&:keys).transform_values{|v| v.map(&:values) }.each do |columns, rows|
69
+ insert = rows.map{|df_id, period_index, *values| [period_index, [values, df_id]] }
70
+ bulk_insert(insert, columns - [:data_frame_id, :period_index])
71
+ end
72
+ end
73
+ end
61
74
  ##
62
75
  # Update block data for all blocks in a single call
63
76
  ##
64
- def bulk_update(existing)
65
- case ActiveRecord::Base.connection_config[:adapter]
66
- when 'postgresql'.freeze
77
+ def bulk_update(existing, columns=block_type::COLUMNS)
78
+ existing.each_slice(ActiveDataFrame.update_max_batch_size) do |existing_slice|
79
+ # puts "Updating slice of #{existing_slice.length}"
80
+ case ActiveRecord::Base.connection_config[:adapter]
81
+ when 'postgresql'.freeze
82
+ #
83
+ # PostgreSQL Supports the fast setting of multiple update values that differ
84
+ # per row from a temporary table.
85
+ #
86
+ updates = ''
87
+ existing_slice.each do |period_index, (values, df_id)|
88
+ updates << "(#{df_id}, #{period_index}, #{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}),"
89
+ end
90
+ Database.execute(
91
+ <<-SQL
92
+ UPDATE #{block_type.table_name}
93
+ SET #{columns.map{|col| "#{col} = t.#{col}" }.join(", ")}
94
+ FROM(
95
+ VALUES #{updates[0..-2]}) as t(data_frame_id, period_index, #{columns.join(',')})
96
+ WHERE #{block_type.table_name}.data_frame_id = t.data_frame_id
97
+ AND #{block_type.table_name}.period_index = t.period_index
98
+ AND #{block_type.table_name}.data_frame_type = '#{data_frame_type.name}'
99
+ SQL
100
+ )
67
101
  #
68
- # PostgreSQL Supports the fast setting of multiple update values that differ
69
- # per row from a temporary table.
102
+ # For MySQL we use the ON DUPLICATE KEY UPDATE functionality.
103
+ # This relies on there being a unique index dataframe and period index
104
+ # on the blocks table.
105
+ # This tends to be faster than the general CASE based solution below
106
+ # but slower than the PostgreSQL solution above
70
107
  #
71
- updates = ''
72
- existing.each do |period_index, (values, df_id)|
73
- updates << "(#{df_id}, #{period_index}, #{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}),"
74
- end
75
- Database.execute(
76
- <<-SQL
77
- UPDATE #{block_type.table_name}
78
- SET #{block_type::COLUMNS.map{|col| "#{col} = t.#{col}" }.join(", ")}
79
- FROM(
80
- VALUES #{updates[0..-2]}) as t(data_frame_id, period_index, #{block_type::COLUMNS.join(',')})
81
- WHERE #{block_type.table_name}.data_frame_id = t.data_frame_id
82
- AND #{block_type.table_name}.period_index = t.period_index
108
+ when 'mysql2'.freeze
109
+ # Fast bulk update
110
+ updates, on_duplicate = "", ""
111
+ existing_slice.each do |period_index, (values, df_id)|
112
+ updates << "(#{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}, #{df_id}, #{period_index}, '#{data_frame_type.name}'),"
113
+ end
114
+ on_duplicate = columns.map do |cname|
115
+ "#{cname}=VALUES(#{cname})"
116
+ end.join(", ")
117
+ stmt = <<-SQL
118
+ INSERT INTO #{block_type.table_name} (#{columns.join(',')},data_frame_id,period_index,data_frame_type)
119
+ VALUES #{updates[0..-2]}
120
+ ON DUPLICATE KEY UPDATE #{on_duplicate}
121
+ SQL
122
+ Database.execute(stmt)
123
+ else
124
+ #
125
+ # General CASE based solution for multiple differing updates
126
+ # set per row.
127
+ # We use a CASE statement per column which determines the column
128
+ # to set based on the period index
129
+ #
130
+ ids = existing_slice.map {|_, (_, id)| id}
131
+ updates = columns.map.with_index do |column, column_idx|
132
+ [column, "CASE period_index\n#{existing_slice.map{|period_index, (values, _)| "WHEN #{period_index} then #{values[column_idx]}"}.join("\n")} \nEND\n"]
133
+ end.to_h
134
+ update_statement = updates.map{|cl, up| "#{cl} = #{up}" }.join(', ')
135
+ Database.execute(<<-SQL
136
+ UPDATE #{block_type.table_name} SET #{update_statement} WHERE
137
+ #{block_type.table_name}.data_frame_id IN (#{ids.join(',')})
83
138
  AND #{block_type.table_name}.data_frame_type = '#{data_frame_type.name}'
139
+ AND #{block_type.table_name}.period_index IN (#{existing_slice.map(&:first).join(', ')});
84
140
  SQL
85
- )
86
- #
87
- # For MySQL we use the ON DUPLICATE KEY UPDATE functionality.
88
- # This relies on there being a unique index dataframe and period index
89
- # on the blocks table.
90
- # This tends to be faster than the general CASE based solution below
91
- # but slower than the PostgreSQL solution above
92
- #
93
- when 'mysql2'.freeze
94
- # Fast bulk update
95
- updates, on_duplicate = "", ""
96
- existing.each do |period_index, (values, df_id)|
97
- updates << "(#{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}, #{df_id}, #{period_index}, '#{data_frame_type.name}'),"
141
+ )
98
142
  end
99
- on_duplicate = block_type::COLUMNS.map do |cname|
100
- "#{cname}=VALUES(#{cname})"
101
- end.join(", ")
102
- stmt = <<-SQL
103
- INSERT INTO #{block_type.table_name} (#{block_type::COLUMNS.join(',')},data_frame_id,period_index,data_frame_type)
104
- VALUES #{updates[0..-2]}
105
- ON DUPLICATE KEY UPDATE #{on_duplicate}
106
- SQL
107
- Database.execute(stmt)
108
- else
109
- #
110
- # General CASE based solution for multiple differing updates
111
- # set per row.
112
- # We use a CASE statement per column which determines the column
113
- # to set based on the period index
114
- #
115
- ids = existing.map {|_, (_, id)| id}
116
- updates = block_type::COLUMNS.map.with_index do |column, column_idx|
117
- [column, "CASE period_index\n#{existing.map{|period_index, (values, _)| "WHEN #{period_index} then #{values[column_idx]}"}.join("\n")} \nEND\n"]
118
- end.to_h
119
- update_statement = updates.map{|cl, up| "#{cl} = #{up}" }.join(', ')
120
- Database.execute(<<-SQL
121
- UPDATE #{block_type.table_name} SET #{update_statement} WHERE
122
- #{block_type.table_name}.data_frame_id IN (#{ids.join(',')})
123
- AND #{block_type.table_name}.data_frame_type = '#{data_frame_type.name}'
124
- AND #{block_type.table_name}.period_index IN (#{existing.keys.join(', ')});
125
- SQL
126
- )
127
143
  end
128
144
  end
129
145
 
130
146
  def bulk_delete(id, indices)
131
- block_type.where(data_frame_id: id, period_index: indices).delete_all
147
+ indices.each_slice(ActiveDataFrame.delete_max_batch_size) do |slice|
148
+ # puts "Deleting slice of #{slice.length}"
149
+ block_type.where(data_frame_id: id, period_index: slice).delete_all
150
+ end
132
151
  end
133
152
 
134
153
  ##
135
154
  # Insert block data for all blocks in a single call
136
155
  ##
137
- def bulk_insert(new_blocks, instance)
138
- inserts = ''
139
- new_blocks.each do |period_index, (values)|
140
- inserts << \
141
- case ActiveRecord::Base.connection_config[:adapter]
142
- when 'postgresql', 'mysql2' then "(#{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}, #{instance.id}, #{period_index}, '#{data_frame_type.name}'),"
143
- else "(#{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}, #{instance.id}, #{period_index}, '#{data_frame_type.name}'),"
156
+ def bulk_insert(new_blocks, columns=block_type::COLUMNS)
157
+ new_blocks.each_slice(ActiveDataFrame.insert_max_batch_size) do |new_blocks_slice|
158
+ # puts "Inserting slice of #{new_blocks_slice.length}"
159
+ inserts = ''
160
+ new_blocks_slice.each do |period_index, (values, df_id)|
161
+ inserts << \
162
+ case ActiveRecord::Base.connection_config[:adapter]
163
+ when 'postgresql', 'mysql2' then "(#{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}, #{df_id}, #{period_index}, '#{data_frame_type.name}'),"
164
+ else "(#{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}, #{df_id}, #{period_index}, '#{data_frame_type.name}'),"
165
+ end
144
166
  end
167
+ sql = "INSERT INTO #{block_type.table_name} (#{columns.join(',')}, data_frame_id, period_index, data_frame_type) VALUES #{inserts[0..-2]}"
168
+ Database.execute sql
145
169
  end
146
- sql = "INSERT INTO #{block_type.table_name} (#{block_type::COLUMNS.join(',')}, data_frame_id, period_index, data_frame_type) VALUES #{inserts[0..-2]}"
147
- Database.execute sql
148
170
  end
149
171
  end
150
172
  end
@@ -12,15 +12,72 @@ module ActiveDataFrame
12
12
  "#{data_frame_type.name} Row(#{instance.id})"
13
13
  end
14
14
 
15
- def set(from, values)
15
+ def self.set_all(scope, block_type, data_frame_type, from, values, trim: false)
16
+ if trim || ActiveRecord::Base.connection_config[:adapter] === 'mysql2'
17
+ scope.each do |instance|
18
+ Row.new(block_type, data_frame_type, instance).patch(from, values.kind_of?(Hash) ? values[instance.id] : values)
19
+ end
20
+ else
21
+ upsert_all(scope, block_type, data_frame_type, from, values)
22
+ end
23
+ end
24
+
25
+ def self.upsert_all(rows, block_type, data_frame_type, from, values)
26
+ length = values.kind_of?(Hash) ? values.values.first.length : values.length
27
+ to = from + length - 1
28
+ bounds = get_bounds(from, to, block_type)
29
+ scope = block_type.where(data_frame_type: data_frame_type.name, data_frame_id: rows.select(:id))
30
+ scope = scope.where(data_frame_id: values.keys) if values.kind_of?(Hash)
31
+ all_update_indices = scope.where(period_index: bounds.from.index..bounds.to.index).order(data_frame_id: :asc, period_index: :asc).pluck(:data_frame_id, :period_index)
32
+ grouped_update_indices = all_update_indices.group_by(&:first).transform_values{|value| Set.new(value.map!(&:last)) }
33
+ instance_ids = rows.pluck(:id)
34
+ instance_ids &= values.keys if values.kind_of?(Hash)
35
+ upserts = to_enum(:iterate_bounds, [bounds], block_type).flat_map do |index, left, right, cursor, size|
36
+ instance_ids.map do |instance_id|
37
+ slice = values.kind_of?(Hash) ? values[instance_id][cursor...cursor + size] : values[cursor...cursor + size]
38
+ [[:data_frame_id, instance_id], [:period_index, index], *(left.succ..right.succ).map{|v| :"t#{v}" }.zip(slice)].to_h
39
+ end
40
+ end
41
+
42
+ update, insert = upserts.partition{|upsert| grouped_update_indices[upsert[:data_frame_id]]&.include?(upsert[:period_index]) }
43
+ Database.for_types(block: block_type, df: data_frame_type).bulk_upsert(update, insert)
44
+ values
45
+ end
46
+
47
+ def set(from, values, trim: false)
48
+ if trim || ActiveRecord::Base.connection_config[:adapter] === 'mysql2'
49
+ patch(from, values)
50
+ else
51
+ upsert(from, values)
52
+ end
53
+ end
54
+
55
+ def upsert(from, values)
56
+ to = (from + values.length) - 1
57
+ bounds = get_bounds(from, to)
58
+ update_indices = Set.new(scope.where(period_index: bounds.from.index..bounds.to.index).order(period_index: :asc).pluck(:period_index))
59
+ # Detect blocks in bounds:
60
+ # - If existing and covered, do an update without load
61
+ # - If existing and uncovered, do a small write (without load)
62
+ # - If not existing, insert!
63
+ upserts = to_enum(:iterate_bounds, [bounds]).map do |index, left, right, cursor, size|
64
+ [[:data_frame_id, self.instance.id], [:period_index, index], *(left.succ..right.succ).map{|v| :"t#{v}" }.zip(values[cursor...cursor + size])].to_h
65
+ end
66
+ update, insert = upserts.partition{|upsert| update_indices.include?(upsert[:period_index]) }
67
+ database.bulk_upsert(update, insert)
68
+ values
69
+ end
70
+
71
+ def patch(from, values)
16
72
  to = (from + values.length) - 1
17
73
  bounds = get_bounds(from, to)
18
74
 
19
75
  new_blocks = Hash.new do |h, k|
20
- h[k] = [[0] * block_type::BLOCK_SIZE]
76
+ h[k] = [[0] * block_type::BLOCK_SIZE, self.instance.id]
21
77
  end
22
78
 
23
79
  deleted_indices = []
80
+
24
81
  existing = blocks_between([bounds]).pluck(:data_frame_id, :period_index, *block_type::COLUMNS).map do |id, period_index, *block_values|
25
82
  [period_index, [block_values, id]]
26
83
  end.to_h
@@ -41,8 +98,8 @@ module ActiveDataFrame
41
98
 
42
99
 
43
100
  database.bulk_delete(self.instance.id, deleted_indices) unless deleted_indices.size.zero?
44
- database.bulk_update(existing) unless existing.size.zero?
45
- database.bulk_insert(new_blocks, instance) unless new_blocks.size.zero?
101
+ database.bulk_update(existing) unless existing.size.zero?
102
+ database.bulk_insert(new_blocks) unless new_blocks.size.zero?
46
103
  values
47
104
  end
48
105
 
@@ -1,11 +1,9 @@
1
1
  module ActiveDataFrame
2
2
  class Table < DataFrameProxy
3
3
 
4
- def set(from, values)
4
+ def set(from, values, trim: false)
5
5
  ActiveDataFrame::Database.batch do
6
- data_frame_type.each do |instance|
7
- Row.new(self.block_type, self.data_frame_type, instance).set(from, values)
8
- end
6
+ Row.set_all(data_frame_type, self.block_type, self.data_frame_type, from, values, trim: trim)
9
7
  end
10
8
  end
11
9
 
@@ -1,3 +1,3 @@
1
1
  module ActiveDataFrame
2
- VERSION = "0.1.5"
2
+ VERSION = "0.1.6"
3
3
  end
@@ -87,7 +87,7 @@ RUBY
87
87
  t.integer :period_index
88
88
  #{
89
89
  columns.times.map do |i|
90
- " t.#{type} :t#{i+1}"
90
+ " t.#{type} :t#{i+1}, default: 0, allow_nil: false"
91
91
  end.join("\n")
92
92
  }
93
93
  RUBY
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: active_data_frame
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: 0.1.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Wouter Coppieters
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2018-06-19 00:00:00.000000000 Z
11
+ date: 2018-07-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler