active_data_frame 0.1.5 → 0.1.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/active_data_frame.rb +6 -3
- data/lib/active_data_frame/data_frame_proxy.rb +27 -2
- data/lib/active_data_frame/database.rb +90 -68
- data/lib/active_data_frame/row.rb +61 -4
- data/lib/active_data_frame/table.rb +2 -4
- data/lib/active_data_frame/version.rb +1 -1
- data/lib/generators/active_data_frame/install_generator.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 32b9b56e2515e43f2a30a10a35fd3c86d8de0bdf
|
4
|
+
data.tar.gz: 875f4e3f4cd9f19d4b0141b4c34a9420a36a06cd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a4558730591012b79e19b19588a925e54b6678fdc279dc457bbad11c08ffdd3b2d4fd55c5d5668e90ed19c1ada58cd9ec73614fbfdbb72765f904ddd5c0509d1
|
7
|
+
data.tar.gz: f9b7852d03b01c23144c12780b87953e8d89fa039635e3bd2a7eb3029ad9ce0df6ee882c8bc1a2b902b7500a74d861aa8ace3720eb9d4b62b79abc5f40e8f9cc
|
data/lib/active_data_frame.rb
CHANGED
@@ -8,7 +8,10 @@ require 'rmatrix'
|
|
8
8
|
|
9
9
|
module ActiveDataFrame
|
10
10
|
CONFIG = OpenStruct.new({
|
11
|
-
suppress_logs:
|
11
|
+
suppress_logs: false,
|
12
|
+
insert_max_batch_size: 10_000,
|
13
|
+
update_max_batch_size: 10_000,
|
14
|
+
delete_max_batch_size: 10_000,
|
12
15
|
})
|
13
16
|
|
14
17
|
module_function
|
@@ -16,7 +19,7 @@ module ActiveDataFrame
|
|
16
19
|
yield CONFIG
|
17
20
|
end
|
18
21
|
|
19
|
-
|
20
|
-
CONFIG.
|
22
|
+
CONFIG.each_pair do |(key)|
|
23
|
+
define_method(key){ CONFIG.send(key) }
|
21
24
|
end
|
22
25
|
end
|
data/lib/active_data_frame/data_frame_proxy.rb
CHANGED
@@ -31,12 +31,27 @@ module ActiveDataFrame
|
|
31
31
|
def []=(from, values)
|
32
32
|
values = Array(values).flatten.map(&@value_map.method(:[])) if @value_map
|
33
33
|
from = column_map[from] if column_map && column_map[from]
|
34
|
-
|
34
|
+
if values.kind_of?(Hash)
|
35
|
+
values = verify_and_cleanse_hash_values(values)
|
36
|
+
else
|
37
|
+
values = M[values, typecode: block_type::TYPECODE].to_a.flatten
|
38
|
+
end
|
39
|
+
set(from, values)
|
40
|
+
end
|
41
|
+
|
42
|
+
def verify_and_cleanse_hash_values(map)
|
43
|
+
length = nil
|
44
|
+
map.transform_values do |values|
|
45
|
+
cleansed = M[values, typecode: block_type::TYPECODE].to_a.flatten
|
46
|
+
raise "All streams provided via a hash must be of the same length" if length && length != cleansed.length
|
47
|
+
length ||= cleansed.length
|
48
|
+
cleansed
|
49
|
+
end
|
35
50
|
end
|
36
51
|
|
37
52
|
def clear(*ranges)
|
38
53
|
extract_ranges(ranges).each do |r|
|
39
|
-
set(r.first, M.blank(columns: r.last - r.first, typecode: block_type::TYPECODE))
|
54
|
+
set(r.first, M.blank(columns: r.last - r.first, typecode: block_type::TYPECODE), trim: true)
|
40
55
|
end
|
41
56
|
end
|
42
57
|
|
@@ -96,6 +111,10 @@ module ActiveDataFrame
|
|
96
111
|
end
|
97
112
|
|
98
113
|
def get_bounds(from, to, index=0)
|
114
|
+
self.class.get_bounds(from, to, block_type, index)
|
115
|
+
end
|
116
|
+
|
117
|
+
def self.get_bounds(from, to, block_type, index=0)
|
99
118
|
from_block_index = from / block_type::BLOCK_SIZE
|
100
119
|
from_block_offset = from % block_type::BLOCK_SIZE
|
101
120
|
to_block_index = to / block_type::BLOCK_SIZE
|
@@ -117,6 +136,12 @@ module ActiveDataFrame
|
|
117
136
|
end
|
118
137
|
|
119
138
|
def iterate_bounds(all_bounds)
|
139
|
+
self.class.iterate_bounds(all_bounds, block_type) do |index, left, right, cursor, size|
|
140
|
+
yield index, left, right, cursor, size
|
141
|
+
end
|
142
|
+
end
|
143
|
+
|
144
|
+
def self.iterate_bounds(all_bounds, block_type)
|
120
145
|
cursor = 0
|
121
146
|
all_bounds.each do |bounds|
|
122
147
|
index = bounds.from.index
|
data/lib/active_data_frame/database.rb
CHANGED
@@ -58,93 +58,115 @@ module ActiveDataFrame
|
|
58
58
|
self.batching = prev_batch
|
59
59
|
flush! unless self.batching
|
60
60
|
end
|
61
|
+
|
62
|
+
def bulk_upsert(updates, inserts)
|
63
|
+
Database.batch do
|
64
|
+
updates.group_by(&:keys).transform_values{|v| v.map(&:values) }.each do |columns, rows|
|
65
|
+
update = rows.map{|df_id, period_index, *values| [period_index, [values, df_id]] }
|
66
|
+
bulk_update(update, columns - [:data_frame_id, :period_index])
|
67
|
+
end
|
68
|
+
inserts.group_by(&:keys).transform_values{|v| v.map(&:values) }.each do |columns, rows|
|
69
|
+
insert = rows.map{|df_id, period_index, *values| [period_index, [values, df_id]] }
|
70
|
+
bulk_insert(insert, columns - [:data_frame_id, :period_index])
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
61
74
|
##
|
62
75
|
# Update block data for all blocks in a single call
|
63
76
|
##
|
64
|
-
def bulk_update(existing)
|
65
|
-
|
66
|
-
|
77
|
+
def bulk_update(existing, columns=block_type::COLUMNS)
|
78
|
+
existing.each_slice(ActiveDataFrame.update_max_batch_size) do |existing_slice|
|
79
|
+
# puts "Updating slice of #{existing_slice.length}"
|
80
|
+
case ActiveRecord::Base.connection_config[:adapter]
|
81
|
+
when 'postgresql'.freeze
|
82
|
+
#
|
83
|
+
# PostgreSQL Supports the fast setting of multiple update values that differ
|
84
|
+
# per row from a temporary table.
|
85
|
+
#
|
86
|
+
updates = ''
|
87
|
+
existing_slice.each do |period_index, (values, df_id)|
|
88
|
+
updates << "(#{df_id}, #{period_index}, #{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}),"
|
89
|
+
end
|
90
|
+
Database.execute(
|
91
|
+
<<-SQL
|
92
|
+
UPDATE #{block_type.table_name}
|
93
|
+
SET #{columns.map{|col| "#{col} = t.#{col}" }.join(", ")}
|
94
|
+
FROM(
|
95
|
+
VALUES #{updates[0..-2]}) as t(data_frame_id, period_index, #{columns.join(',')})
|
96
|
+
WHERE #{block_type.table_name}.data_frame_id = t.data_frame_id
|
97
|
+
AND #{block_type.table_name}.period_index = t.period_index
|
98
|
+
AND #{block_type.table_name}.data_frame_type = '#{data_frame_type.name}'
|
99
|
+
SQL
|
100
|
+
)
|
67
101
|
#
|
68
|
-
#
|
69
|
-
#
|
102
|
+
# For MySQL we use the ON DUPLICATE KEY UPDATE functionality.
|
103
|
+
# This relies on there being a unique index dataframe and period index
|
104
|
+
# on the blocks table.
|
105
|
+
# This tends to be faster than the general CASE based solution below
|
106
|
+
# but slower than the PostgreSQL solution above
|
70
107
|
#
|
71
|
-
|
72
|
-
|
73
|
-
updates
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
108
|
+
when 'mysql2'.freeze
|
109
|
+
# Fast bulk update
|
110
|
+
updates, on_duplicate = "", ""
|
111
|
+
existing_slice.each do |period_index, (values, df_id)|
|
112
|
+
updates << "(#{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}, #{df_id}, #{period_index}, '#{data_frame_type.name}'),"
|
113
|
+
end
|
114
|
+
on_duplicate = columns.map do |cname|
|
115
|
+
"#{cname}=VALUES(#{cname})"
|
116
|
+
end.join(", ")
|
117
|
+
stmt = <<-SQL
|
118
|
+
INSERT INTO #{block_type.table_name} (#{columns.join(',')},data_frame_id,period_index,data_frame_type)
|
119
|
+
VALUES #{updates[0..-2]}
|
120
|
+
ON DUPLICATE KEY UPDATE #{on_duplicate}
|
121
|
+
SQL
|
122
|
+
Database.execute(stmt)
|
123
|
+
else
|
124
|
+
#
|
125
|
+
# General CASE based solution for multiple differing updates
|
126
|
+
# set per row.
|
127
|
+
# We use a CASE statement per column which determines the column
|
128
|
+
# to set based on the period index
|
129
|
+
#
|
130
|
+
ids = existing_slice.map {|_, (_, id)| id}
|
131
|
+
updates = columns.map.with_index do |column, column_idx|
|
132
|
+
[column, "CASE period_index\n#{existing_slice.map{|period_index, (values, _)| "WHEN #{period_index} then #{values[column_idx]}"}.join("\n")} \nEND\n"]
|
133
|
+
end.to_h
|
134
|
+
update_statement = updates.map{|cl, up| "#{cl} = #{up}" }.join(', ')
|
135
|
+
Database.execute(<<-SQL
|
136
|
+
UPDATE #{block_type.table_name} SET #{update_statement} WHERE
|
137
|
+
#{block_type.table_name}.data_frame_id IN (#{ids.join(',')})
|
83
138
|
AND #{block_type.table_name}.data_frame_type = '#{data_frame_type.name}'
|
139
|
+
AND #{block_type.table_name}.period_index IN (#{existing_slice.map(&:first).join(', ')});
|
84
140
|
SQL
|
85
|
-
|
86
|
-
#
|
87
|
-
# For MySQL we use the ON DUPLICATE KEY UPDATE functionality.
|
88
|
-
# This relies on there being a unique index dataframe and period index
|
89
|
-
# on the blocks table.
|
90
|
-
# This tends to be faster than the general CASE based solution below
|
91
|
-
# but slower than the PostgreSQL solution above
|
92
|
-
#
|
93
|
-
when 'mysql2'.freeze
|
94
|
-
# Fast bulk update
|
95
|
-
updates, on_duplicate = "", ""
|
96
|
-
existing.each do |period_index, (values, df_id)|
|
97
|
-
updates << "(#{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}, #{df_id}, #{period_index}, '#{data_frame_type.name}'),"
|
141
|
+
)
|
98
142
|
end
|
99
|
-
on_duplicate = block_type::COLUMNS.map do |cname|
|
100
|
-
"#{cname}=VALUES(#{cname})"
|
101
|
-
end.join(", ")
|
102
|
-
stmt = <<-SQL
|
103
|
-
INSERT INTO #{block_type.table_name} (#{block_type::COLUMNS.join(',')},data_frame_id,period_index,data_frame_type)
|
104
|
-
VALUES #{updates[0..-2]}
|
105
|
-
ON DUPLICATE KEY UPDATE #{on_duplicate}
|
106
|
-
SQL
|
107
|
-
Database.execute(stmt)
|
108
|
-
else
|
109
|
-
#
|
110
|
-
# General CASE based solution for multiple differing updates
|
111
|
-
# set per row.
|
112
|
-
# We use a CASE statement per column which determines the column
|
113
|
-
# to set based on the period index
|
114
|
-
#
|
115
|
-
ids = existing.map {|_, (_, id)| id}
|
116
|
-
updates = block_type::COLUMNS.map.with_index do |column, column_idx|
|
117
|
-
[column, "CASE period_index\n#{existing.map{|period_index, (values, _)| "WHEN #{period_index} then #{values[column_idx]}"}.join("\n")} \nEND\n"]
|
118
|
-
end.to_h
|
119
|
-
update_statement = updates.map{|cl, up| "#{cl} = #{up}" }.join(', ')
|
120
|
-
Database.execute(<<-SQL
|
121
|
-
UPDATE #{block_type.table_name} SET #{update_statement} WHERE
|
122
|
-
#{block_type.table_name}.data_frame_id IN (#{ids.join(',')})
|
123
|
-
AND #{block_type.table_name}.data_frame_type = '#{data_frame_type.name}'
|
124
|
-
AND #{block_type.table_name}.period_index IN (#{existing.keys.join(', ')});
|
125
|
-
SQL
|
126
|
-
)
|
127
143
|
end
|
128
144
|
end
|
129
145
|
|
130
146
|
def bulk_delete(id, indices)
|
131
|
-
|
147
|
+
indices.each_slice(ActiveDataFrame.delete_max_batch_size) do |slice|
|
148
|
+
# puts "Deleting slice of #{slice.length}"
|
149
|
+
block_type.where(data_frame_id: id, period_index: slice).delete_all
|
150
|
+
end
|
132
151
|
end
|
133
152
|
|
134
153
|
##
|
135
154
|
# Insert block data for all blocks in a single call
|
136
155
|
##
|
137
|
-
def bulk_insert(new_blocks,
|
138
|
-
|
139
|
-
|
140
|
-
inserts
|
141
|
-
|
142
|
-
|
143
|
-
|
156
|
+
def bulk_insert(new_blocks, columns=block_type::COLUMNS)
|
157
|
+
new_blocks.each_slice(ActiveDataFrame.insert_max_batch_size) do |new_blocks_slice|
|
158
|
+
# puts "Inserting slice of #{new_blocks_slice.length}"
|
159
|
+
inserts = ''
|
160
|
+
new_blocks_slice.each do |period_index, (values, df_id)|
|
161
|
+
inserts << \
|
162
|
+
case ActiveRecord::Base.connection_config[:adapter]
|
163
|
+
when 'postgresql', 'mysql2' then "(#{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}, #{df_id}, #{period_index}, '#{data_frame_type.name}'),"
|
164
|
+
else "(#{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}, #{df_id}, #{period_index}, '#{data_frame_type.name}'),"
|
165
|
+
end
|
144
166
|
end
|
167
|
+
sql = "INSERT INTO #{block_type.table_name} (#{columns.join(',')}, data_frame_id, period_index, data_frame_type) VALUES #{inserts[0..-2]}"
|
168
|
+
Database.execute sql
|
145
169
|
end
|
146
|
-
sql = "INSERT INTO #{block_type.table_name} (#{block_type::COLUMNS.join(',')}, data_frame_id, period_index, data_frame_type) VALUES #{inserts[0..-2]}"
|
147
|
-
Database.execute sql
|
148
170
|
end
|
149
171
|
end
|
150
172
|
end
|
data/lib/active_data_frame/row.rb
CHANGED
@@ -12,15 +12,72 @@ module ActiveDataFrame
|
|
12
12
|
"#{data_frame_type.name} Row(#{instance.id})"
|
13
13
|
end
|
14
14
|
|
15
|
-
def
|
15
|
+
def self.set_all(scope, block_type, data_frame_type, from, values, trim: false)
|
16
|
+
if trim || ActiveRecord::Base.connection_config[:adapter] === 'mysql2'
|
17
|
+
scope.each do |instance|
|
18
|
+
Row.new(block_type, data_frame_type, instance).patch(from, values.kind_of?(Hash) ? values[instance.id] : values)
|
19
|
+
end
|
20
|
+
else
|
21
|
+
upsert_all(scope, block_type, data_frame_type, from, values)
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
def self.upsert_all(rows, block_type, data_frame_type, from, values)
|
26
|
+
length = values.kind_of?(Hash) ? values.values.first.length : values.length
|
27
|
+
to = from + length - 1
|
28
|
+
bounds = get_bounds(from, to, block_type)
|
29
|
+
scope = block_type.where(data_frame_type: data_frame_type.name, data_frame_id: rows.select(:id))
|
30
|
+
scope = scope.where(data_frame_id: values.keys) if values.kind_of?(Hash)
|
31
|
+
all_update_indices = scope.where(period_index: bounds.from.index..bounds.to.index).order(data_frame_id: :asc, period_index: :asc).pluck(:data_frame_id, :period_index)
|
32
|
+
grouped_update_indices = all_update_indices.group_by(&:first).transform_values{|value| Set.new(value.map!(&:last)) }
|
33
|
+
instance_ids = rows.pluck(:id)
|
34
|
+
instance_ids &= values.keys if values.kind_of?(Hash)
|
35
|
+
upserts = to_enum(:iterate_bounds, [bounds], block_type).flat_map do |index, left, right, cursor, size|
|
36
|
+
instance_ids.map do |instance_id|
|
37
|
+
slice = values.kind_of?(Hash) ? values[instance_id][cursor...cursor + size] : values[cursor...cursor + size]
|
38
|
+
[[:data_frame_id, instance_id], [:period_index, index], *(left.succ..right.succ).map{|v| :"t#{v}" }.zip(slice)].to_h
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
update, insert = upserts.partition{|upsert| grouped_update_indices[upsert[:data_frame_id]]&.include?(upsert[:period_index]) }
|
43
|
+
Database.for_types(block: block_type, df: data_frame_type).bulk_upsert(update, insert)
|
44
|
+
values
|
45
|
+
end
|
46
|
+
|
47
|
+
def set(from, values, trim: false)
|
48
|
+
if trim || ActiveRecord::Base.connection_config[:adapter] === 'mysql2'
|
49
|
+
patch(from, values)
|
50
|
+
else
|
51
|
+
upsert(from, values)
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
def upsert(from, values)
|
56
|
+
to = (from + values.length) - 1
|
57
|
+
bounds = get_bounds(from, to)
|
58
|
+
update_indices = Set.new(scope.where(period_index: bounds.from.index..bounds.to.index).order(period_index: :asc).pluck(:period_index))
|
59
|
+
# Detect blocks in bounds:
|
60
|
+
# - If existing and covered, do an update without load
|
61
|
+
# - If existing and uncovered, do a small write (without load)
|
62
|
+
# - If not existing, insert!
|
63
|
+
upserts = to_enum(:iterate_bounds, [bounds]).map do |index, left, right, cursor, size|
|
64
|
+
[[:data_frame_id, self.instance.id], [:period_index, index], *(left.succ..right.succ).map{|v| :"t#{v}" }.zip(values[cursor...cursor + size])].to_h
|
65
|
+
end
|
66
|
+
update, insert = upserts.partition{|upsert| update_indices.include?(upsert[:period_index]) }
|
67
|
+
database.bulk_upsert(update, insert)
|
68
|
+
values
|
69
|
+
end
|
70
|
+
|
71
|
+
def patch(from, values)
|
16
72
|
to = (from + values.length) - 1
|
17
73
|
bounds = get_bounds(from, to)
|
18
74
|
|
19
75
|
new_blocks = Hash.new do |h, k|
|
20
|
-
h[k] = [[0] * block_type::BLOCK_SIZE]
|
76
|
+
h[k] = [[0] * block_type::BLOCK_SIZE, self.instance.id]
|
21
77
|
end
|
22
78
|
|
23
79
|
deleted_indices = []
|
80
|
+
|
24
81
|
existing = blocks_between([bounds]).pluck(:data_frame_id, :period_index, *block_type::COLUMNS).map do |id, period_index, *block_values|
|
25
82
|
[period_index, [block_values, id]]
|
26
83
|
end.to_h
|
@@ -41,8 +98,8 @@ module ActiveDataFrame
|
|
41
98
|
|
42
99
|
|
43
100
|
database.bulk_delete(self.instance.id, deleted_indices) unless deleted_indices.size.zero?
|
44
|
-
database.bulk_update(existing)
|
45
|
-
database.bulk_insert(new_blocks
|
101
|
+
database.bulk_update(existing) unless existing.size.zero?
|
102
|
+
database.bulk_insert(new_blocks) unless new_blocks.size.zero?
|
46
103
|
values
|
47
104
|
end
|
48
105
|
|
data/lib/active_data_frame/table.rb
CHANGED
@@ -1,11 +1,9 @@
|
|
1
1
|
module ActiveDataFrame
|
2
2
|
class Table < DataFrameProxy
|
3
3
|
|
4
|
-
def set(from, values)
|
4
|
+
def set(from, values, trim: false)
|
5
5
|
ActiveDataFrame::Database.batch do
|
6
|
-
data_frame_type.
|
7
|
-
Row.new(self.block_type, self.data_frame_type, instance).set(from, values)
|
8
|
-
end
|
6
|
+
Row.set_all(data_frame_type, self.block_type, self.data_frame_type, from, values, trim: trim)
|
9
7
|
end
|
10
8
|
end
|
11
9
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: active_data_frame
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.5
|
4
|
+
version: 0.1.6
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Wouter Coppieters
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2018-
|
11
|
+
date: 2018-07-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|