active_data_frame 0.1.5 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/active_data_frame.rb +6 -3
- data/lib/active_data_frame/data_frame_proxy.rb +27 -2
- data/lib/active_data_frame/database.rb +90 -68
- data/lib/active_data_frame/row.rb +61 -4
- data/lib/active_data_frame/table.rb +2 -4
- data/lib/active_data_frame/version.rb +1 -1
- data/lib/generators/active_data_frame/install_generator.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 32b9b56e2515e43f2a30a10a35fd3c86d8de0bdf
+  data.tar.gz: 875f4e3f4cd9f19d4b0141b4c34a9420a36a06cd
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: a4558730591012b79e19b19588a925e54b6678fdc279dc457bbad11c08ffdd3b2d4fd55c5d5668e90ed19c1ada58cd9ec73614fbfdbb72765f904ddd5c0509d1
+  data.tar.gz: f9b7852d03b01c23144c12780b87953e8d89fa039635e3bd2a7eb3029ad9ce0df6ee882c8bc1a2b902b7500a74d861aa8ace3720eb9d4b62b79abc5f40e8f9cc
data/lib/active_data_frame.rb
CHANGED
@@ -8,7 +8,10 @@ require 'rmatrix'
 
 module ActiveDataFrame
   CONFIG = OpenStruct.new({
-    suppress_logs:
+    suppress_logs: false,
+    insert_max_batch_size: 10_000,
+    update_max_batch_size: 10_000,
+    delete_max_batch_size: 10_000,
   })
 
   module_function
@@ -16,7 +19,7 @@ module ActiveDataFrame
     yield CONFIG
   end
 
-
-  CONFIG.
+  CONFIG.each_pair do |(key)|
+    define_method(key){ CONFIG.send(key) }
   end
 end
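The three new *_max_batch_size settings cap how many rows each generated bulk INSERT, UPDATE, or DELETE may touch, and the CONFIG.each_pair loop defines a module-level reader for every setting (these readers are what database.rb calls below). A minimal tuning sketch, assuming the gem's block-yielding hook around `yield CONFIG` is exposed as `ActiveDataFrame.configure`:

    # Sketch only: `configure` is assumed from the `yield CONFIG` hunk above.
    ActiveDataFrame.configure do |config|
      config.insert_max_batch_size = 5_000   # rows per generated bulk INSERT
      config.update_max_batch_size = 5_000   # rows per generated bulk UPDATE
      config.delete_max_batch_size = 20_000  # period indices per bulk DELETE
    end
    ActiveDataFrame.insert_max_batch_size    # => 5000, via the generated reader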
data/lib/active_data_frame/data_frame_proxy.rb
CHANGED
@@ -31,12 +31,27 @@ module ActiveDataFrame
     def []=(from, values)
       values = Array(values).flatten.map(&@value_map.method(:[])) if @value_map
       from = column_map[from] if column_map && column_map[from]
-
+      if values.kind_of?(Hash)
+        values = verify_and_cleanse_hash_values(values)
+      else
+        values = M[values, typecode: block_type::TYPECODE].to_a.flatten
+      end
+      set(from, values)
+    end
+
+    def verify_and_cleanse_hash_values(map)
+      length = nil
+      map.transform_values do |values|
+        cleansed = M[values, typecode: block_type::TYPECODE].to_a.flatten
+        raise "All streams provided via a hash must be of the same length" if length && length != cleansed.length
+        length ||= cleansed.length
+        cleansed
+      end
     end
 
     def clear(*ranges)
       extract_ranges(ranges).each do |r|
-        set(r.first, M.blank(columns: r.last - r.first, typecode: block_type::TYPECODE))
+        set(r.first, M.blank(columns: r.last - r.first, typecode: block_type::TYPECODE), trim: true)
       end
     end
 
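[]= now accepts a Hash of streams keyed by data frame id, with verify_and_cleanse_hash_values enforcing that every stream has the same length, and clear now asks set to trim rather than upsert blank blocks. An illustration of the Hash form (the `House` model and `temperatures` proxy names are invented):

    House.temperatures[0] = { 1 => [20.1, 20.3], 2 => [19.8, 19.9] }  # one equal-length stream per row id
    House.temperatures[0] = { 1 => [20.1, 20.3], 2 => [19.8] }
    # => RuntimeError: All streams provided via a hash must be of the same length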
@@ -96,6 +111,10 @@ module ActiveDataFrame
     end
 
     def get_bounds(from, to, index=0)
+      self.class.get_bounds(from, to, block_type, index)
+    end
+
+    def self.get_bounds(from, to, block_type, index=0)
       from_block_index = from / block_type::BLOCK_SIZE
       from_block_offset = from % block_type::BLOCK_SIZE
       to_block_index = to / block_type::BLOCK_SIZE
@@ -117,6 +136,12 @@ module ActiveDataFrame
     end
 
     def iterate_bounds(all_bounds)
+      self.class.iterate_bounds(all_bounds, block_type) do |index, left, right, cursor, size|
+        yield index, left, right, cursor, size
+      end
+    end
+
+    def self.iterate_bounds(all_bounds, block_type)
       cursor = 0
       all_bounds.each do |bounds|
         index = bounds.from.index
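get_bounds and iterate_bounds are now also available as class methods so that Row.upsert_all (in the row.rb hunk below) can compute block bounds without a proxy instance; the arithmetic itself is unchanged. A worked example, assuming a hypothetical BLOCK_SIZE of 512:

    BLOCK_SIZE = 512      # illustrative block width, not the gem's actual value
    from, to = 1_000, 1_500
    from / BLOCK_SIZE     # => 1   (from_block_index: index 1000 falls in block 1)
    from % BLOCK_SIZE     # => 488 (from_block_offset: at offset 488 within it)
    to / BLOCK_SIZE       # => 2   (to_block_index)
    to % BLOCK_SIZE       # => 476 (to_block_offset)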
data/lib/active_data_frame/database.rb
CHANGED
@@ -58,93 +58,115 @@ module ActiveDataFrame
       self.batching = prev_batch
       flush! unless self.batching
     end
+
+    def bulk_upsert(updates, inserts)
+      Database.batch do
+        updates.group_by(&:keys).transform_values{|v| v.map(&:values) }.each do |columns, rows|
+          update = rows.map{|df_id, period_index, *values| [period_index, [values, df_id]] }
+          bulk_update(update, columns - [:data_frame_id, :period_index])
+        end
+        inserts.group_by(&:keys).transform_values{|v| v.map(&:values) }.each do |columns, rows|
+          insert = rows.map{|df_id, period_index, *values| [period_index, [values, df_id]] }
+          bulk_insert(insert, columns - [:data_frame_id, :period_index])
+        end
+      end
+    end
     ##
     # Update block data for all blocks in a single call
     ##
-    def bulk_update(existing)
-
-
+    def bulk_update(existing, columns=block_type::COLUMNS)
+      existing.each_slice(ActiveDataFrame.update_max_batch_size) do |existing_slice|
+        # puts "Updating slice of #{existing_slice.length}"
+        case ActiveRecord::Base.connection_config[:adapter]
+        when 'postgresql'.freeze
+          #
+          # PostgreSQL Supports the fast setting of multiple update values that differ
+          # per row from a temporary table.
+          #
+          updates = ''
+          existing_slice.each do |period_index, (values, df_id)|
+            updates << "(#{df_id}, #{period_index}, #{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}),"
+          end
+          Database.execute(
+            <<-SQL
+            UPDATE #{block_type.table_name}
+            SET #{columns.map{|col| "#{col} = t.#{col}" }.join(", ")}
+            FROM(
+            VALUES #{updates[0..-2]}) as t(data_frame_id, period_index, #{columns.join(',')})
+            WHERE #{block_type.table_name}.data_frame_id = t.data_frame_id
+            AND #{block_type.table_name}.period_index = t.period_index
+            AND #{block_type.table_name}.data_frame_type = '#{data_frame_type.name}'
+            SQL
+          )
           #
-          #
-          #
+          # For MySQL we use the ON DUPLICATE KEY UPDATE functionality.
+          # This relies on there being a unique index dataframe and period index
+          # on the blocks table.
+          # This tends to be faster than the general CASE based solution below
+          # but slower than the PostgreSQL solution above
           #
-
-
-      updates
-
-
-
-
-
-
-
-
-
+        when 'mysql2'.freeze
+          # Fast bulk update
+          updates, on_duplicate = "", ""
+          existing_slice.each do |period_index, (values, df_id)|
+            updates << "(#{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}, #{df_id}, #{period_index}, '#{data_frame_type.name}'),"
+          end
+          on_duplicate = columns.map do |cname|
+            "#{cname}=VALUES(#{cname})"
+          end.join(", ")
+          stmt = <<-SQL
+            INSERT INTO #{block_type.table_name} (#{columns.join(',')},data_frame_id,period_index,data_frame_type)
+            VALUES #{updates[0..-2]}
+            ON DUPLICATE KEY UPDATE #{on_duplicate}
+          SQL
+          Database.execute(stmt)
+        else
+          #
+          # General CASE based solution for multiple differing updates
+          # set per row.
+          # We use a CASE statement per column which determines the column
+          # to set based on the period index
+          #
+          ids = existing_slice.map {|_, (_, id)| id}
+          updates = columns.map.with_index do |column, column_idx|
+            [column, "CASE period_index\n#{existing_slice.map{|period_index, (values, _)| "WHEN #{period_index} then #{values[column_idx]}"}.join("\n")} \nEND\n"]
+          end.to_h
+          update_statement = updates.map{|cl, up| "#{cl} = #{up}" }.join(', ')
+          Database.execute(<<-SQL
+            UPDATE #{block_type.table_name} SET #{update_statement} WHERE
+            #{block_type.table_name}.data_frame_id IN (#{ids.join(',')})
             AND #{block_type.table_name}.data_frame_type = '#{data_frame_type.name}'
+            AND #{block_type.table_name}.period_index IN (#{existing_slice.map(&:first).join(', ')});
             SQL
-
-          #
-          # For MySQL we use the ON DUPLICATE KEY UPDATE functionality.
-          # This relies on there being a unique index dataframe and period index
-          # on the blocks table.
-          # This tends to be faster than the general CASE based solution below
-          # but slower than the PostgreSQL solution above
-          #
-        when 'mysql2'.freeze
-          # Fast bulk update
-          updates, on_duplicate = "", ""
-          existing.each do |period_index, (values, df_id)|
-            updates << "(#{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}, #{df_id}, #{period_index}, '#{data_frame_type.name}'),"
+          )
         end
-        on_duplicate = block_type::COLUMNS.map do |cname|
-          "#{cname}=VALUES(#{cname})"
-        end.join(", ")
-        stmt = <<-SQL
-          INSERT INTO #{block_type.table_name} (#{block_type::COLUMNS.join(',')},data_frame_id,period_index,data_frame_type)
-          VALUES #{updates[0..-2]}
-          ON DUPLICATE KEY UPDATE #{on_duplicate}
-        SQL
-        Database.execute(stmt)
-      else
-        #
-        # General CASE based solution for multiple differing updates
-        # set per row.
-        # We use a CASE statement per column which determines the column
-        # to set based on the period index
-        #
-        ids = existing.map {|_, (_, id)| id}
-        updates = block_type::COLUMNS.map.with_index do |column, column_idx|
-          [column, "CASE period_index\n#{existing.map{|period_index, (values, _)| "WHEN #{period_index} then #{values[column_idx]}"}.join("\n")} \nEND\n"]
-        end.to_h
-        update_statement = updates.map{|cl, up| "#{cl} = #{up}" }.join(', ')
-        Database.execute(<<-SQL
-          UPDATE #{block_type.table_name} SET #{update_statement} WHERE
-          #{block_type.table_name}.data_frame_id IN (#{ids.join(',')})
-          AND #{block_type.table_name}.data_frame_type = '#{data_frame_type.name}'
-          AND #{block_type.table_name}.period_index IN (#{existing.keys.join(', ')});
-        SQL
-        )
       end
     end
 
     def bulk_delete(id, indices)
-
+      indices.each_slice(ActiveDataFrame.delete_max_batch_size) do |slice|
+        # puts "Deleting slice of #{slice.length}"
+        block_type.where(data_frame_id: id, period_index: slice).delete_all
+      end
     end
 
     ##
     # Insert block data for all blocks in a single call
    ##
-    def bulk_insert(new_blocks,
-
-
-      inserts
-
-
-
+    def bulk_insert(new_blocks, columns=block_type::COLUMNS)
+      new_blocks.each_slice(ActiveDataFrame.insert_max_batch_size) do |new_blocks_slice|
+        # puts "Inserting slice of #{new_blocks_slice.length}"
+        inserts = ''
+        new_blocks_slice.each do |period_index, (values, df_id)|
+          inserts << \
+          case ActiveRecord::Base.connection_config[:adapter]
+          when 'postgresql', 'mysql2' then "(#{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}, #{df_id}, #{period_index}, '#{data_frame_type.name}'),"
+          else "(#{values.map{|v| v.inspect.gsub('"',"'") }.join(',')}, #{df_id}, #{period_index}, '#{data_frame_type.name}'),"
+          end
        end
+        sql = "INSERT INTO #{block_type.table_name} (#{columns.join(',')}, data_frame_id, period_index, data_frame_type) VALUES #{inserts[0..-2]}"
+        Database.execute sql
      end
-      sql = "INSERT INTO #{block_type.table_name} (#{block_type::COLUMNS.join(',')}, data_frame_id, period_index, data_frame_type) VALUES #{inserts[0..-2]}"
-      Database.execute sql
    end
  end
 end
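For a concrete sense of the three adapter strategies, this is roughly what bulk_update renders on PostgreSQL for two blocks of a hypothetical df_float_blocks table with value columns t1..t3 (all names and values invented for illustration):

    existing = { 0 => [[1.5, 2.5, 3.5], 7], 1 => [[4.5, 5.5, 6.5], 7] }
    # database.bulk_update(existing) would emit approximately:
    #
    #   UPDATE df_float_blocks
    #   SET t1 = t.t1, t2 = t.t2, t3 = t.t3
    #   FROM (VALUES (7, 0, 1.5, 2.5, 3.5),
    #                (7, 1, 4.5, 5.5, 6.5)) as t(data_frame_id, period_index, t1, t2, t3)
    #   WHERE df_float_blocks.data_frame_id = t.data_frame_id
    #   AND df_float_blocks.period_index = t.period_index
    #   AND df_float_blocks.data_frame_type = 'House'
    #
    # On mysql2 the same rows become one INSERT ... ON DUPLICATE KEY UPDATE,
    # which depends on the unique (data_frame_id, period_index) index noted in
    # the comments above; every other adapter gets the CASE-per-column UPDATE.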
data/lib/active_data_frame/row.rb
CHANGED
@@ -12,15 +12,72 @@ module ActiveDataFrame
       "#{data_frame_type.name} Row(#{instance.id})"
     end
 
-    def
+    def self.set_all(scope, block_type, data_frame_type, from, values, trim: false)
+      if trim || ActiveRecord::Base.connection_config[:adapter] === 'mysql2'
+        scope.each do |instance|
+          Row.new(block_type, data_frame_type, instance).patch(from, values.kind_of?(Hash) ? values[instance.id] : values)
+        end
+      else
+        upsert_all(scope, block_type, data_frame_type, from, values)
+      end
+    end
+
+    def self.upsert_all(rows, block_type, data_frame_type, from, values)
+      length = values.kind_of?(Hash) ? values.values.first.length : values.length
+      to = from + length - 1
+      bounds = get_bounds(from, to, block_type)
+      scope = block_type.where(data_frame_type: data_frame_type.name, data_frame_id: rows.select(:id))
+      scope = scope.where(data_frame_id: values.keys) if values.kind_of?(Hash)
+      all_update_indices = scope.where(period_index: bounds.from.index..bounds.to.index).order(data_frame_id: :asc, period_index: :asc).pluck(:data_frame_id, :period_index)
+      grouped_update_indices = all_update_indices.group_by(&:first).transform_values{|value| Set.new(value.map!(&:last)) }
+      instance_ids = rows.pluck(:id)
+      instance_ids &= values.keys if values.kind_of?(Hash)
+      upserts = to_enum(:iterate_bounds, [bounds], block_type).flat_map do |index, left, right, cursor, size|
+        instance_ids.map do |instance_id|
+          slice = values.kind_of?(Hash) ? values[instance_id][cursor...cursor + size] : values[cursor...cursor + size]
+          [[:data_frame_id, instance_id], [:period_index, index], *(left.succ..right.succ).map{|v| :"t#{v}" }.zip(slice)].to_h
+        end
+      end
+
+      update, insert = upserts.partition{|upsert| grouped_update_indices[upsert[:data_frame_id]]&.include?(upsert[:period_index]) }
+      Database.for_types(block: block_type, df: data_frame_type).bulk_upsert(update, insert)
+      values
+    end
+
+    def set(from, values, trim: false)
+      if trim || ActiveRecord::Base.connection_config[:adapter] === 'mysql2'
+        patch(from, values)
+      else
+        upsert(from, values)
+      end
+    end
+
+    def upsert(from, values)
+      to = (from + values.length) - 1
+      bounds = get_bounds(from, to)
+      update_indices = Set.new(scope.where(period_index: bounds.from.index..bounds.to.index).order(period_index: :asc).pluck(:period_index))
+      # Detect blocks in bounds:
+      # - If existing and covered, do an update without load
+      # - If existing and uncovered, do a small write (without load)
+      # - If not existing, insert!
+      upserts = to_enum(:iterate_bounds, [bounds]).map do |index, left, right, cursor, size|
+        [[:data_frame_id, self.instance.id], [:period_index, index], *(left.succ..right.succ).map{|v| :"t#{v}" }.zip(values[cursor...cursor + size])].to_h
+      end
+      update, insert = upserts.partition{|upsert| update_indices.include?(upsert[:period_index]) }
+      database.bulk_upsert(update, insert)
+      values
+    end
+
+    def patch(from, values)
       to = (from + values.length) - 1
       bounds = get_bounds(from, to)
 
       new_blocks = Hash.new do |h, k|
-        h[k] = [[0] * block_type::BLOCK_SIZE]
+        h[k] = [[0] * block_type::BLOCK_SIZE, self.instance.id]
       end
 
       deleted_indices = []
+
       existing = blocks_between([bounds]).pluck(:data_frame_id, :period_index, *block_type::COLUMNS).map do |id, period_index, *block_values|
         [period_index, [block_values, id]]
       end.to_h
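The new upsert path never loads existing block contents: it plucks the period indices that already exist, builds one row hash per (data frame, block), and partitions them into UPDATEs and INSERTs for a single bulk_upsert call. The partition step in isolation, with invented values:

    require 'set'

    update_indices = Set[0, 1]   # period indices already present in the table
    upserts = [
      { data_frame_id: 7, period_index: 1, t1: 1.5 },  # exists  -> bulk_update
      { data_frame_id: 7, period_index: 2, t1: 2.5 },  # missing -> bulk_insert
    ]
    update, insert = upserts.partition { |u| update_indices.include?(u[:period_index]) }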
@@ -41,8 +98,8 @@ module ActiveDataFrame
 
 
       database.bulk_delete(self.instance.id, deleted_indices) unless deleted_indices.size.zero?
-      database.bulk_update(existing)
-      database.bulk_insert(new_blocks
+      database.bulk_update(existing) unless existing.size.zero?
+      database.bulk_insert(new_blocks) unless new_blocks.size.zero?
       values
     end
 
data/lib/active_data_frame/table.rb
CHANGED
@@ -1,11 +1,9 @@
 module ActiveDataFrame
   class Table < DataFrameProxy
 
-    def set(from, values)
+    def set(from, values, trim: false)
       ActiveDataFrame::Database.batch do
-        data_frame_type.
-        Row.new(self.block_type, self.data_frame_type, instance).set(from, values)
-        end
+        Row.set_all(data_frame_type, self.block_type, self.data_frame_type, from, values, trim: trim)
       end
     end
 
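Table#set now funnels every table-wide write through Row.set_all, which upserts in bulk on PostgreSQL and falls back to the per-row read-merge-write patch on mysql2 or whenever trim: true is passed (as clear does, so trimmed zeros actually overwrite stored blocks). Illustrative calls, reusing the invented House.temperatures proxy from above:

    House.temperatures.set(0, [20.1, 20.3, 20.5])  # bulk upsert path on PostgreSQL
    House.temperatures.clear(0...1_000)            # set(..., trim: true) -> patch path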
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: active_data_frame
 version: !ruby/object:Gem::Version
-  version: 0.1.5
+  version: 0.1.6
 platform: ruby
 authors:
 - Wouter Coppieters
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2018-
+date: 2018-07-19 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler