upsert 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +6 -0
- data/README.md +9 -2
- data/lib/upsert.rb +2 -16
- data/lib/upsert/mysql2_client.rb +20 -36
- data/lib/upsert/pg_connection.rb +5 -5
- data/lib/upsert/pg_connection/column_definition.rb +4 -4
- data/lib/upsert/row.rb +19 -11
- data/lib/upsert/version.rb +1 -1
- data/test/test_mysql2.rb +66 -1
- data/test/test_pg.rb +1 -1
- data/test/test_sqlite.rb +1 -1
- metadata +1 -1
data/CHANGELOG
CHANGED
data/README.md
CHANGED
@@ -33,7 +33,7 @@ For bulk upserts, you probably still want to use `Upsert.stream`.
|
|
33
33
|
Pet.upsert({:name => 'Jerry'}, :breed => 'beagle')
|
34
34
|
Pet.upsert({:name => 'Pierre'}, :breed => 'tabby')
|
35
35
|
|
36
|
-
###
|
36
|
+
### The "fixed column set" gotcha
|
37
37
|
|
38
38
|
Currently, the first row you pass in determines the columns that will be used. That's useful for mass importing of many rows with the same columns, but is surprising if you're trying to use a single `Upsert` object to add arbitrary data. For example, this won't work:
|
39
39
|
|
@@ -47,7 +47,14 @@ You would need to use a new `Upsert` object. On the other hand, this is totally
|
|
47
47
|
Pet.upsert({:name => 'Jerry'}, :breed => 'beagle')
|
48
48
|
Pet.upsert({:tag_number => 456}, :spiel => 'great cat')
|
49
49
|
|
50
|
-
|
50
|
+
## Wishlist
|
51
|
+
|
52
|
+
Pull requests for any of these would be greatly appreciated:
|
53
|
+
|
54
|
+
1. Somebody who understands statistics should look at how I'm sampling rows in `Upsert::Mysql2_Client#estimate_variable_sql_bytesize`. I think we can assume that row sizes are randomly distributed among the buffered rows, so I don't think we actually have to select random elements to get a representative sample.
|
55
|
+
2. Fix SQLite tests.
|
56
|
+
3. A fix for the "fixed column set" gotcha, if you can think of one.
|
57
|
+
4. Naming suggestions: should "document" be called "setters" or "attributes"? Should "stream" be "batch" instead?
|
51
58
|
|
52
59
|
## Real-world usage
|
53
60
|
|
data/lib/upsert.rb
CHANGED
@@ -49,6 +49,7 @@ class Upsert
|
|
49
49
|
USEC_SPRINTF = '%06d'
|
50
50
|
ISO8601_DATETIME = '%Y-%m-%d %H:%M:%S'
|
51
51
|
ISO8601_DATE = '%F'
|
52
|
+
NULL_WORD = 'NULL'
|
52
53
|
|
53
54
|
# @return [Mysql2::Client,Sqlite3::Database,PG::Connection,#raw_connection]
|
54
55
|
attr_reader :connection
|
@@ -120,7 +121,7 @@ class Upsert
|
|
120
121
|
def quote_value(v)
|
121
122
|
case v
|
122
123
|
when NilClass
|
123
|
-
|
124
|
+
NULL_WORD
|
124
125
|
when Upsert::Binary
|
125
126
|
quote_binary v # must be defined by base
|
126
127
|
when String
|
@@ -141,19 +142,4 @@ class Upsert
|
|
141
142
|
raise "not sure how to quote #{v.class}: #{v.inspect}"
|
142
143
|
end
|
143
144
|
end
|
144
|
-
|
145
|
-
# @private
|
146
|
-
def quote_idents(idents)
|
147
|
-
idents.map { |k| quote_ident(k) }.join(',') # must be defined by base
|
148
|
-
end
|
149
|
-
|
150
|
-
# @private
|
151
|
-
def quote_values(values)
|
152
|
-
values.map { |v| quote_value(v) }.join(',')
|
153
|
-
end
|
154
|
-
|
155
|
-
# @private
|
156
|
-
def quote_pairs(pairs)
|
157
|
-
pairs.map { |k, v| [quote_ident(k),quote_value(v)].join('=') }.join(',')
|
158
|
-
end
|
159
145
|
end
|
data/lib/upsert/mysql2_client.rb
CHANGED
@@ -1,6 +1,8 @@
|
|
1
1
|
class Upsert
|
2
2
|
# @private
|
3
3
|
module Mysql2_Client
|
4
|
+
SAMPLE = 0.1
|
5
|
+
|
4
6
|
def chunk
|
5
7
|
return if rows.empty?
|
6
8
|
all = rows.length
|
@@ -11,9 +13,9 @@ class Upsert
|
|
11
13
|
if async? and take == all
|
12
14
|
return
|
13
15
|
end
|
14
|
-
while take >
|
16
|
+
while take > 2 and oversize?(take)
|
15
17
|
$stderr.puts " Length prediction via sampling failed, shrinking" if ENV['UPSERT_DEBUG'] == 'true'
|
16
|
-
take -=
|
18
|
+
take -= 2
|
17
19
|
end
|
18
20
|
chunk = sql take
|
19
21
|
while take > 1 and chunk.bytesize > max_sql_bytesize
|
@@ -46,14 +48,13 @@ class Upsert
|
|
46
48
|
end
|
47
49
|
|
48
50
|
def insert_part
|
49
|
-
@insert_part ||= %{INSERT INTO "#{table_name}" (#{
|
51
|
+
@insert_part ||= %{INSERT INTO "#{table_name}" (#{columns.join(',')}) VALUES }
|
50
52
|
end
|
51
53
|
|
52
54
|
def update_part
|
53
55
|
@update_part ||= begin
|
54
56
|
updaters = columns.map do |k|
|
55
|
-
|
56
|
-
[ qk, "VALUES(#{qk})" ].join('=')
|
57
|
+
[ k, "VALUES(#{k})" ].join('=')
|
57
58
|
end.join(',')
|
58
59
|
%{ ON DUPLICATE KEY UPDATE #{updaters}}
|
59
60
|
end
|
@@ -64,13 +65,18 @@ class Upsert
|
|
64
65
|
@static_sql_bytesize ||= insert_part.bytesize + update_part.bytesize + 2
|
65
66
|
end
|
66
67
|
|
67
|
-
|
68
|
+
|
68
69
|
def variable_sql_bytesize(take)
|
69
|
-
rows.first(take).inject(0) { |sum, row| sum + row.values_sql_bytesize
|
70
|
+
memo = rows.first(take).inject(0) { |sum, row| sum + row.values_sql_bytesize }
|
71
|
+
if take > 0
|
72
|
+
# parens and comma
|
73
|
+
memo += 3*(take-1)
|
74
|
+
end
|
75
|
+
memo
|
70
76
|
end
|
71
77
|
|
72
78
|
def estimate_variable_sql_bytesize(take)
|
73
|
-
n = (take
|
79
|
+
n = (take * SAMPLE).ceil
|
74
80
|
sample = if RUBY_VERSION >= '1.9'
|
75
81
|
rows.first(take).sample(n)
|
76
82
|
else
|
@@ -82,7 +88,12 @@ class Upsert
|
|
82
88
|
end
|
83
89
|
memo.first(n)
|
84
90
|
end
|
85
|
-
|
91
|
+
memo = sample.inject(0) { |sum, row| sum + row.values_sql_bytesize } / SAMPLE
|
92
|
+
if take > 0
|
93
|
+
# parens and comma
|
94
|
+
memo += 3*(take-1)
|
95
|
+
end
|
96
|
+
memo
|
86
97
|
end
|
87
98
|
|
88
99
|
def sql_bytesize(take)
|
@@ -112,33 +123,6 @@ class Upsert
|
|
112
123
|
end
|
113
124
|
end
|
114
125
|
|
115
|
-
def quoted_value_bytesize(v)
|
116
|
-
case v
|
117
|
-
when NilClass
|
118
|
-
4
|
119
|
-
when TrueClass
|
120
|
-
4
|
121
|
-
when FalseClass
|
122
|
-
5
|
123
|
-
when BigDecimal
|
124
|
-
v.to_s('F').bytesize
|
125
|
-
when Upsert::Binary
|
126
|
-
v.bytesize * 2 + 3
|
127
|
-
when Numeric
|
128
|
-
v.to_s.bytesize
|
129
|
-
when String
|
130
|
-
v.bytesize + 2
|
131
|
-
when Symbol
|
132
|
-
v.to_s.bytesize + 2
|
133
|
-
when Time, DateTime
|
134
|
-
24 + 2
|
135
|
-
when Date
|
136
|
-
10 + 2
|
137
|
-
else
|
138
|
-
raise "not sure how to get quoted length of #{v.class}: #{v.inspect}"
|
139
|
-
end
|
140
|
-
end
|
141
|
-
|
142
126
|
def quote_boolean(v)
|
143
127
|
v ? 'TRUE' : 'FALSE'
|
144
128
|
end
|
data/lib/upsert/pg_connection.rb
CHANGED
@@ -14,9 +14,9 @@ class Upsert
|
|
14
14
|
end
|
15
15
|
hsh = row.to_hash
|
16
16
|
ordered_args = column_definitions.map do |c|
|
17
|
-
hsh[c.name]
|
17
|
+
hsh[c.name] || NULL_WORD
|
18
18
|
end
|
19
|
-
%{SELECT #{merge_function}(#{
|
19
|
+
%{SELECT #{merge_function}(#{ordered_args.join(',')})}
|
20
20
|
end
|
21
21
|
|
22
22
|
def execute(sql)
|
@@ -56,12 +56,12 @@ class Upsert
|
|
56
56
|
def create_merge_function(example_row)
|
57
57
|
@merge_function = "pg_temp.merge_#{table_name}_#{Kernel.rand(1e11)}"
|
58
58
|
execute <<-EOS
|
59
|
-
CREATE FUNCTION #{merge_function}(#{column_definitions.map { |c| "#{
|
59
|
+
CREATE FUNCTION #{merge_function}(#{column_definitions.map { |c| "#{c.input_name} #{c.sql_type} DEFAULT #{c.default || 'NULL'}" }.join(',') }) RETURNS VOID AS
|
60
60
|
$$
|
61
61
|
BEGIN
|
62
62
|
LOOP
|
63
63
|
-- first try to update the key
|
64
|
-
UPDATE #{table_name} SET #{column_definitions.map { |c| "#{
|
64
|
+
UPDATE #{table_name} SET #{column_definitions.map { |c| "#{c.name} = #{c.input_name}" }.join(',')} WHERE #{example_row.raw_selector.keys.map { |k| "#{quote_ident(k)} = #{quote_ident([k,'input'].join('_'))}" }.join(' AND ') };
|
65
65
|
IF found THEN
|
66
66
|
RETURN;
|
67
67
|
END IF;
|
@@ -69,7 +69,7 @@ BEGIN
|
|
69
69
|
-- if someone else inserts the same key concurrently,
|
70
70
|
-- we could get a unique-key failure
|
71
71
|
BEGIN
|
72
|
-
INSERT INTO #{table_name}(#{column_definitions.map { |c|
|
72
|
+
INSERT INTO #{table_name}(#{column_definitions.map { |c| c.name }.join(',')}) VALUES (#{column_definitions.map { |c| c.input_name }.join(',')});
|
73
73
|
RETURN;
|
74
74
|
EXCEPTION WHEN unique_violation THEN
|
75
75
|
-- Do nothing, and loop to try the UPDATE again.
|
@@ -39,7 +39,7 @@ EOS
|
|
39
39
|
res.reject do |row|
|
40
40
|
row['name'] == auto_increment_primary_key
|
41
41
|
end.map do |row|
|
42
|
-
new row['name'], row['sql_type'], row['default']
|
42
|
+
new connection, row['name'], row['sql_type'], row['default']
|
43
43
|
end
|
44
44
|
end
|
45
45
|
end
|
@@ -49,9 +49,9 @@ EOS
|
|
49
49
|
attr_reader :sql_type
|
50
50
|
attr_reader :default
|
51
51
|
|
52
|
-
def initialize(
|
53
|
-
@name =
|
54
|
-
@input_name = "#{
|
52
|
+
def initialize(connection, raw_name, sql_type, default)
|
53
|
+
@name = connection.quote_ident raw_name
|
54
|
+
@input_name = connection.quote_ident "#{raw_name}_input"
|
55
55
|
@sql_type = sql_type
|
56
56
|
@default = default
|
57
57
|
end
|
data/lib/upsert/row.rb
CHANGED
@@ -2,13 +2,21 @@ class Upsert
|
|
2
2
|
# @private
|
3
3
|
class Row
|
4
4
|
attr_reader :parent
|
5
|
+
attr_reader :raw_selector
|
5
6
|
attr_reader :selector
|
6
7
|
attr_reader :document
|
7
8
|
|
8
|
-
def initialize(parent,
|
9
|
+
def initialize(parent, raw_selector, raw_document)
|
9
10
|
@parent = parent
|
10
|
-
@
|
11
|
-
@
|
11
|
+
@raw_selector = raw_selector
|
12
|
+
@selector = raw_selector.inject({}) do |memo, (k, v)|
|
13
|
+
memo[parent.quote_ident(k)] = parent.quote_value(v)
|
14
|
+
memo
|
15
|
+
end
|
16
|
+
@document = raw_document.inject({}) do |memo, (k, v)|
|
17
|
+
memo[parent.quote_ident(k)] = parent.quote_value(v)
|
18
|
+
memo
|
19
|
+
end
|
12
20
|
end
|
13
21
|
|
14
22
|
def columns
|
@@ -16,40 +24,40 @@ class Upsert
|
|
16
24
|
end
|
17
25
|
|
18
26
|
def values_sql_bytesize
|
19
|
-
@values_sql_bytesize ||= pairs.inject(0) { |sum, (_, v)| sum +
|
27
|
+
@values_sql_bytesize ||= pairs.inject(0) { |sum, (_, v)| sum + v.to_s.bytesize } + columns.length - 1
|
20
28
|
end
|
21
29
|
|
22
30
|
def values_sql
|
23
|
-
|
31
|
+
pairs.map { |_, v| v }.join(',')
|
24
32
|
end
|
25
33
|
|
26
34
|
def columns_sql
|
27
|
-
|
35
|
+
pairs.map { |k, _| k }.join(',')
|
28
36
|
end
|
29
37
|
|
30
38
|
def where_sql
|
31
|
-
|
39
|
+
selector.map { |k, v| [k, v].join('=') }.join(',')
|
32
40
|
end
|
33
41
|
|
34
42
|
def set_sql
|
35
|
-
|
43
|
+
pairs.map { |k, v| [k, v].join('=') }.join(',')
|
36
44
|
end
|
37
45
|
|
38
46
|
def pairs
|
39
47
|
@pairs ||= columns.map do |k|
|
40
|
-
|
48
|
+
v = if document.has_key?(k)
|
41
49
|
# prefer the document so that you can change rows
|
42
50
|
document[k]
|
43
51
|
else
|
44
52
|
selector[k]
|
45
53
|
end
|
46
|
-
[ k,
|
54
|
+
[ k, v ]
|
47
55
|
end
|
48
56
|
end
|
49
57
|
|
50
58
|
def to_hash
|
51
59
|
@to_hash ||= pairs.inject({}) do |memo, (k, v)|
|
52
|
-
memo[k
|
60
|
+
memo[k] = v
|
53
61
|
memo
|
54
62
|
end
|
55
63
|
end
|
data/lib/upsert/version.rb
CHANGED
data/test/test_mysql2.rb
CHANGED
@@ -4,7 +4,7 @@ require 'mysql2'
|
|
4
4
|
system %{ mysql -u root -ppassword -e "DROP DATABASE IF EXISTS test_upsert; CREATE DATABASE test_upsert CHARSET utf8" }
|
5
5
|
ActiveRecord::Base.establish_connection :adapter => 'mysql2', :username => 'root', :password => 'password', :database => 'test_upsert'
|
6
6
|
|
7
|
-
describe
|
7
|
+
describe Upsert::Mysql2_Client do
|
8
8
|
before do
|
9
9
|
@opened_connections = []
|
10
10
|
ActiveRecord::Base.connection.drop_table(Pet.table_name) rescue nil
|
@@ -38,4 +38,69 @@ describe "upserting on mysql2" do
|
|
38
38
|
it_also "doesn't mess with timezones"
|
39
39
|
|
40
40
|
it_also "doesn't blow up on reserved words"
|
41
|
+
|
42
|
+
describe '#sql_bytesize' do
|
43
|
+
def assert_exact(selector_proc, document_proc, show = false)
|
44
|
+
upsert = Upsert.new connection, :pets
|
45
|
+
0.upto(256) do |i|
|
46
|
+
upsert.rows << Upsert::Row.new(upsert, selector_proc.call(i), document_proc.call(i))
|
47
|
+
i.upto(upsert.rows.length) do |take|
|
48
|
+
expected_sql = upsert.sql(take)
|
49
|
+
actual = upsert.sql_bytesize(take)
|
50
|
+
if show and actual != expected_sql.bytesize
|
51
|
+
$stderr.puts
|
52
|
+
$stderr.puts "Expected: #{expected_sql.bytesize}"
|
53
|
+
$stderr.puts "Actual: #{actual}"
|
54
|
+
$stderr.puts expected_sql
|
55
|
+
end
|
56
|
+
actual.must_equal expected_sql.bytesize
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
60
|
+
def rand_string(length)
|
61
|
+
# http://www.dzone.com/snippets/generate-random-string-letters
|
62
|
+
# Array.new(length) { (rand(122-97) + 97).chr }.join
|
63
|
+
if RUBY_VERSION >= '1.9'
|
64
|
+
Array.new(length) { rand(512).chr(Encoding::UTF_8) }.join
|
65
|
+
else
|
66
|
+
Array.new(length) { rand(512) }.pack('C*')
|
67
|
+
end
|
68
|
+
end
|
69
|
+
it "is exact as selector length changes" do
|
70
|
+
selector_proc = proc do |i|
|
71
|
+
{ :name => rand_string(i) }
|
72
|
+
end
|
73
|
+
document_proc = proc do |i|
|
74
|
+
{}
|
75
|
+
end
|
76
|
+
assert_exact selector_proc, document_proc
|
77
|
+
end
|
78
|
+
it "is exact as value length changes" do
|
79
|
+
selector_proc = proc do |i|
|
80
|
+
{ :name => 'Jerry' }
|
81
|
+
end
|
82
|
+
document_proc = proc do |i|
|
83
|
+
{ :spiel => rand_string(i) }
|
84
|
+
end
|
85
|
+
assert_exact selector_proc, document_proc
|
86
|
+
end
|
87
|
+
it "is exact as both selector and value length change" do
|
88
|
+
selector_proc = proc do |i|
|
89
|
+
{ :name => rand_string(i) }
|
90
|
+
end
|
91
|
+
document_proc = proc do |i|
|
92
|
+
{ :spiel => rand_string(i) }
|
93
|
+
end
|
94
|
+
assert_exact selector_proc, document_proc
|
95
|
+
end
|
96
|
+
it "is exact with numbers too" do
|
97
|
+
selector_proc = proc do |i|
|
98
|
+
{ :tag_number => rand(1e5) }
|
99
|
+
end
|
100
|
+
document_proc = proc do |i|
|
101
|
+
{ :lovability => rand }
|
102
|
+
end
|
103
|
+
assert_exact selector_proc, document_proc
|
104
|
+
end
|
105
|
+
end
|
41
106
|
end
|
data/test/test_pg.rb
CHANGED
@@ -5,7 +5,7 @@ system %{ dropdb test_upsert }
|
|
5
5
|
system %{ createdb test_upsert }
|
6
6
|
ActiveRecord::Base.establish_connection :adapter => 'postgresql', :database => 'test_upsert'
|
7
7
|
|
8
|
-
describe
|
8
|
+
describe Upsert::PG_Connection do
|
9
9
|
before do
|
10
10
|
@opened_connections = []
|
11
11
|
ActiveRecord::Base.connection.drop_table(Pet.table_name) rescue nil
|
data/test/test_sqlite.rb
CHANGED
@@ -6,7 +6,7 @@ FileUtils.mkdir_p File.dirname(db_path)
|
|
6
6
|
FileUtils.rm_f db_path
|
7
7
|
ActiveRecord::Base.establish_connection :adapter => 'sqlite3', :database => db_path
|
8
8
|
|
9
|
-
describe
|
9
|
+
describe Upsert::SQLite3_Database do
|
10
10
|
before do
|
11
11
|
@opened_connections = []
|
12
12
|
ActiveRecord::Base.connection.drop_table(Pet.table_name) rescue nil
|