upsert 0.1.2 → 0.2.0
- data/.gitignore +1 -0
- data/README.md +14 -10
- data/Rakefile +1 -1
- data/lib/upsert.rb +95 -10
- data/lib/upsert/active_record_upsert.rb +12 -0
- data/lib/upsert/mysql2_client.rb +160 -0
- data/lib/upsert/pg_connection.rb +84 -0
- data/lib/upsert/pg_connection/column_definition.rb +60 -0
- data/lib/upsert/row.rb +8 -8
- data/lib/upsert/sqlite3_database.rb +39 -0
- data/lib/upsert/version.rb +1 -1
- data/test/misc/get_postgres_reserved_words.rb +12 -0
- data/test/misc/mysql_reserved.txt +226 -0
- data/test/misc/pg_reserved.txt +742 -0
- data/test/shared/multibyte.rb +2 -2
- data/test/shared/reserved_words.rb +41 -0
- data/test/test_active_record_upsert.rb +23 -0
- data/test/test_mysql2.rb +2 -0
- data/test/test_pg.rb +2 -0
- data/test/test_sqlite.rb +14 -11
- metadata +17 -8
- data/lib/upsert/buffer.rb +0 -58
- data/lib/upsert/buffer/mysql2_client.rb +0 -164
- data/lib/upsert/buffer/pg_connection.rb +0 -87
- data/lib/upsert/buffer/pg_connection/column_definition.rb +0 -60
- data/lib/upsert/buffer/sqlite3_database.rb +0 -43
- data/lib/upsert/quoter.rb +0 -43
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -18,7 +18,9 @@ Let's say you have...
     document = {:breed => 'beagle'}
     upsert.row selector, document

-###
+### Streaming upserts (fastest)
+
+Rows are buffered in memory until it's efficient to send them to the database.

     Upsert.stream(Pet.connection, Pet.table_name) do |upsert|
       # [...]
@@ -28,7 +30,14 @@ Let's say you have...
       # [...]
     end

-
+### With a helper method
+
+For bulk upserts, you probably still want to use `Upsert.stream`.
+
+    # be sure to require 'upsert/active_record_upsert' - it's not required by default
+    selector = {:name => 'Jerry'}
+    document = {:breed => 'beagle'}
+    Pet.upsert selector, document

 ## Real-world usage

@@ -54,9 +63,9 @@ Using the [mysql2](https://rubygems.org/gems/mysql2) driver.
 From the tests:

     Upsert was 77% faster than find + new/set/save
-    Upsert was
-    Upsert was
-    Upsert was
+    Upsert was 58% faster than create + rescue/find/update
+    Upsert was 80% faster than find_or_create + update_attributes
+    Upsert was 39% faster than faking upserts with activerecord-import

 #### SQL MERGE trick

@@ -199,11 +208,6 @@ This, however, only works on MySQL and requires ActiveRecord—and if all yo

 The `selector` and `document` arguments are inspired by the upsert functionality of the [mongo-ruby-driver's update method](http://api.mongodb.org/ruby/1.6.4/Mongo/Collection.html#update-instance_method).

-## Wishlist
-
-1. `Pet.upsert`... duh
-2. Don't need a separate buffer class... just extend an instance of Upsert with the appropriate database driver module.
-
 ## Copyright

 Copyright 2012 Brighter Planet, Inc.
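For reference, the two usage styles described in the README changes above, side by side. This is a minimal sketch assuming an ActiveRecord model `Pet` backed by a `pets` table with a unique index on `name` (the model and index are illustrative, not part of the diff):

```ruby
require 'upsert'
require 'upsert/active_record_upsert' # adds Pet.upsert; not required by default

# Streaming: rows are buffered in memory and flushed in efficient batches.
Upsert.stream(Pet.connection, Pet.table_name) do |upsert|
  upsert.row({:name => 'Jerry'},  :breed => 'beagle')
  upsert.row({:name => 'Pierre'}, :breed => 'tabby')
end

# Helper method: a one-off upsert for a single record.
Pet.upsert({:name => 'Jerry'}, :breed => 'beagle')
```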
data/Rakefile
CHANGED
@@ -10,7 +10,7 @@ Rake::TestTask.new(:_test) do |test|
 end

 task :test_each_db_adapter do
-  %w{ mysql2 sqlite pg active_record_connection_adapter }.each do |database|
+  %w{ active_record_upsert mysql2 sqlite pg active_record_connection_adapter }.each do |database|
     puts
     puts "#{'*'*10} Running #{database} tests"
     puts
data/lib/upsert.rb
CHANGED
@@ -2,12 +2,10 @@ require 'bigdecimal'

 require 'upsert/version'
 require 'upsert/binary'
-require 'upsert/buffer'
-require 'upsert/quoter'
 require 'upsert/row'
-require 'upsert/
-require 'upsert/
-require 'upsert/
+require 'upsert/mysql2_client'
+require 'upsert/pg_connection'
+require 'upsert/sqlite3_database'

 class Upsert
   class << self
@@ -33,9 +31,9 @@ class Upsert
     # end
     def stream(connection, table_name)
       upsert = new connection, table_name
-      upsert.
+      upsert.async!
       yield upsert
-      upsert.
+      upsert.sync!
     end
   end

@@ -43,13 +41,38 @@ class Upsert
  class TooBig < RuntimeError
  end

+  SINGLE_QUOTE = %{'}
+  DOUBLE_QUOTE = %{"}
+  BACKTICK = %{`}
+  E_AND_SINGLE_QUOTE = %{E'}
+  X_AND_SINGLE_QUOTE = %{x'}
+  USEC_SPRINTF = '%06d'
+  ISO8601_DATETIME = '%Y-%m-%d %H:%M:%S'
+  ISO8601_DATE = '%F'
+
+  # @return [Mysql2::Client,Sqlite3::Database,PG::Connection,#raw_connection]
+  attr_reader :connection
+
+  # @return [String,Symbol]
+  attr_reader :table_name
+
   # @private
-  attr_reader :
+  attr_reader :rows

   # @param [Mysql2::Client,Sqlite3::Database,PG::Connection,#raw_connection] connection A supported database connection.
   # @param [String,Symbol] table_name The name of the table into which you will be upserting.
   def initialize(connection, table_name)
-    @
+    @table_name = table_name
+    @rows = []
+
+    @connection = if connection.respond_to?(:raw_connection)
+      # deal with ActiveRecord::Base.connection or ActiveRecord::Base.connection_pool.checkout
+      connection.raw_connection
+    else
+      connection
+    end
+
+    extend Upsert.const_get(@connection.class.name.gsub(/\W+/, '_'))
   end

   # Upsert a row given a selector and a document.
@@ -68,7 +91,69 @@ class Upsert
   # upsert.row({:name => 'Jerry'}, :breed => 'beagle')
   # upsert.row({:name => 'Pierre'}, :breed => 'tabby')
   def row(selector, document)
-
+    rows << Row.new(self, selector, document)
+    if sql = chunk
+      execute sql
+    end
     nil
   end
+
+  # @private
+  def async?
+    !!@async
+  end
+
+  # @private
+  def async!
+    @async = true
+  end
+
+  # @private
+  def sync!
+    @async = false
+    while sql = chunk
+      execute sql
+    end
+  end
+
+  # @private
+  def quote_value(v)
+    case v
+    when NilClass
+      'NULL'
+    when Upsert::Binary
+      quote_binary v # must be defined by base
+    when String
+      quote_string v # must be defined by base
+    when TrueClass, FalseClass
+      quote_boolean v
+    when BigDecimal
+      quote_big_decimal v
+    when Numeric
+      v
+    when Symbol
+      quote_string v.to_s
+    when Time, DateTime
+      quote_time v # must be defined by base
+    when Date
+      quote_string v.strftime(ISO8601_DATE)
+    else
+      raise "not sure how to quote #{v.class}: #{v.inspect}"
+    end
+  end
+
+  # @private
+  def quote_idents(idents)
+    idents.map { |k| quote_ident(k) }.join(',') # must be defined by base
+  end
+
+  # @private
+  def quote_values(values)
+    values.map { |v| quote_value(v) }.join(',')
+  end
+
+  # @private
+  def quote_pairs(pairs)
+    pairs.map { |k, v| [quote_ident(k),quote_value(v)].join('=') }.join(',')
+  end
 end
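The new `initialize` above picks a driver module by mangling the raw connection's class name and mixing it into the instance. A small illustration of that mapping; the SQLite module name is inferred from the same convention, and the class names listed are the common drivers rather than anything taken from this diff:

```ruby
# How `extend Upsert.const_get(@connection.class.name.gsub(/\W+/, '_'))`
# resolves a driver module; the class names below are illustrative.
%w[ Mysql2::Client PG::Connection SQLite3::Database ].each do |name|
  puts "#{name} -> Upsert::#{name.gsub(/\W+/, '_')}"
end
# Mysql2::Client    -> Upsert::Mysql2_Client
# PG::Connection    -> Upsert::PG_Connection
# SQLite3::Database -> Upsert::SQLite3_Database
```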
data/lib/upsert/active_record_upsert.rb
ADDED
@@ -0,0 +1,12 @@
+class Upsert
+  module ActiveRecordUpsert
+    def upsert(selector, document)
+      ActiveRecord::Base.connection_pool.with_connection do |c|
+        upsert = Upsert.new c, table_name
+        upsert.row selector, document
+      end
+    end
+  end
+end
+
+ActiveRecord::Base.extend Upsert::ActiveRecordUpsert
data/lib/upsert/mysql2_client.rb
ADDED
@@ -0,0 +1,160 @@
+class Upsert
+  # @private
+  module Mysql2_Client
+    def chunk
+      return if rows.empty?
+      all = rows.length
+      take = all
+      while take > 1 and probably_oversize?(take)
+        take -= 1
+      end
+      if async? and take == all
+        return
+      end
+      while take > 1 and oversize?(take)
+        $stderr.puts "  Length prediction via sampling failed, shrinking" if ENV['UPSERT_DEBUG'] == 'true'
+        take -= 1
+      end
+      chunk = sql take
+      while take > 1 and chunk.bytesize > max_sql_bytesize
+        $stderr.puts "  Supposedly exact bytesize guess failed, shrinking" if ENV['UPSERT_DEBUG'] == 'true'
+        take -= 1
+        chunk = sql take
+      end
+      if chunk.bytesize > max_sql_bytesize
+        raise TooBig
+      end
+      $stderr.puts "  Chunk (#{take}/#{chunk.bytesize}) was #{(chunk.bytesize / max_sql_bytesize.to_f * 100).round}% of the max" if ENV['UPSERT_DEBUG'] == 'true'
+      @rows = rows.drop(take)
+      chunk
+    end
+
+    def execute(sql)
+      connection.query sql
+    end
+
+    def probably_oversize?(take)
+      estimate_sql_bytesize(take) > max_sql_bytesize
+    end
+
+    def oversize?(take)
+      sql_bytesize(take) > max_sql_bytesize
+    end
+
+    def columns
+      @columns ||= rows.first.columns
+    end
+
+    def insert_part
+      @insert_part ||= %{INSERT INTO "#{table_name}" (#{quote_idents(columns)}) VALUES }
+    end
+
+    def update_part
+      @update_part ||= begin
+        updaters = columns.map do |k|
+          qk = quote_ident k
+          [ qk, "VALUES(#{qk})" ].join('=')
+        end.join(',')
+        %{ ON DUPLICATE KEY UPDATE #{updaters}}
+      end
+    end
+
+    # where 2 is the parens
+    def static_sql_bytesize
+      @static_sql_bytesize ||= insert_part.bytesize + update_part.bytesize + 2
+    end
+
+    # where 3 is parens and comma
+    def variable_sql_bytesize(take)
+      rows.first(take).inject(0) { |sum, row| sum + row.values_sql_bytesize + 3 }
+    end
+
+    def estimate_variable_sql_bytesize(take)
+      p = (take / 10.0).ceil
+      10.0 * rows.sample(p).inject(0) { |sum, row| sum + row.values_sql_bytesize + 3 }
+    end
+
+    def sql_bytesize(take)
+      static_sql_bytesize + variable_sql_bytesize(take)
+    end
+
+    def estimate_sql_bytesize(take)
+      static_sql_bytesize + estimate_variable_sql_bytesize(take)
+    end
+
+    def sql(take)
+      all_value_sql = rows.first(take).map { |row| row.values_sql }
+      [ insert_part, '(', all_value_sql.join('),('), ')', update_part ].join
+    end
+
+    # since setting an option like :as => :hash actually persists that option to the client, don't pass any options
+    def max_sql_bytesize
+      @max_sql_bytesize ||= begin
+        case (row = connection.query("SHOW VARIABLES LIKE 'max_allowed_packet'").first)
+        when Array
+          row[1]
+        when Hash
+          row['Value']
+        else
+          raise "Don't know what to do if connection.query returns a #{row.class}"
+        end.to_i
+      end
+    end
+
+    def quoted_value_bytesize(v)
+      case v
+      when NilClass
+        4
+      when TrueClass
+        4
+      when FalseClass
+        5
+      when BigDecimal
+        v.to_s('F').bytesize
+      when Upsert::Binary
+        v.bytesize * 2 + 3
+      when Numeric
+        v.to_s.bytesize
+      when String
+        v.bytesize + 2
+      when Symbol
+        v.to_s.bytesize + 2
+      when Time, DateTime
+        24 + 2
+      when Date
+        10 + 2
+      else
+        raise "not sure how to get quoted length of #{v.class}: #{v.inspect}"
+      end
+    end
+
+    def quote_boolean(v)
+      v ? 'TRUE' : 'FALSE'
+    end
+
+    def quote_string(v)
+      SINGLE_QUOTE + connection.escape(v) + SINGLE_QUOTE
+    end
+
+    # This doubles the size of the representation.
+    def quote_binary(v)
+      X_AND_SINGLE_QUOTE + v.unpack("H*")[0] + SINGLE_QUOTE
+    end
+
+    # put raw binary straight into sql
+    # might work if we could get the encoding issues fixed when joining together the values for the sql
+    # alias_method :quote_binary, :quote_string
+
+    def quote_time(v)
+      quote_string v.strftime(ISO8601_DATETIME)
+    end
+
+    def quote_ident(k)
+      BACKTICK + connection.escape(k.to_s) + BACKTICK
+    end
+
+    def quote_big_decimal(v)
+      v.to_s('F')
+    end
+  end
+end
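To make the chunking concrete, here is a standalone sketch of the statement that `insert_part`, `sql(take)`, and `update_part` assemble, using a hypothetical `pets` table with pre-quoted identifiers and values (real code routes these through `quote_ident` and `quote_value`):

```ruby
# Simplified re-creation of Mysql2_Client#sql for two buffered rows.
# Table, columns, and values are hypothetical.
columns   = ['`name`', '`breed`']
value_sql = ["'Jerry','beagle'", "'Pierre','tabby'"]

insert_part = "INSERT INTO `pets` (#{columns.join(',')}) VALUES "
update_part = ' ON DUPLICATE KEY UPDATE ' +
  columns.map { |qk| "#{qk}=VALUES(#{qk})" }.join(',')

puts [insert_part, '(', value_sql.join('),('), ')', update_part].join
# INSERT INTO `pets` (`name`,`breed`) VALUES ('Jerry','beagle'),('Pierre','tabby')
#   ON DUPLICATE KEY UPDATE `name`=VALUES(`name`),`breed`=VALUES(`breed`)
```

`chunk` keeps adding buffered rows until the estimated bytesize would exceed the server's `max_allowed_packet`, then emits one such statement.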
data/lib/upsert/pg_connection.rb
ADDED
@@ -0,0 +1,84 @@
+require 'upsert/pg_connection/column_definition'
+
+class Upsert
+  # @private
+  module PG_Connection
+
+    attr_reader :merge_function
+
+    def chunk
+      return if rows.empty?
+      row = rows.shift
+      unless merge_function
+        create_merge_function row
+      end
+      hsh = row.to_hash
+      ordered_args = column_definitions.map do |c|
+        hsh[c.name]
+      end
+      %{SELECT #{merge_function}(#{quote_values(ordered_args)})}
+    end
+
+    def execute(sql)
+      connection.exec sql
+    end
+
+    def quote_string(v)
+      SINGLE_QUOTE + connection.escape_string(v) + SINGLE_QUOTE
+    end
+
+    def quote_binary(v)
+      E_AND_SINGLE_QUOTE + connection.escape_bytea(v) + SINGLE_QUOTE
+    end
+
+    def quote_time(v)
+      quote_string [v.strftime(ISO8601_DATETIME), sprintf(USEC_SPRINTF, v.usec)].join('.')
+    end
+
+    def quote_big_decimal(v)
+      v.to_s('F')
+    end
+
+    def quote_boolean(v)
+      v ? 'TRUE' : 'FALSE'
+    end
+
+    def quote_ident(k)
+      connection.quote_ident k.to_s
+    end
+
+    def column_definitions
+      @column_definitions ||= ColumnDefinition.all(connection, table_name)
+    end
+
+    private
+
+    def create_merge_function(example_row)
+      @merge_function = "pg_temp.merge_#{table_name}_#{Kernel.rand(1e11)}"
+      execute <<-EOS
+        CREATE FUNCTION #{merge_function}(#{column_definitions.map { |c| "#{quote_ident(c.input_name)} #{c.sql_type} DEFAULT #{c.default || 'NULL'}" }.join(',') }) RETURNS VOID AS
+        $$
+        BEGIN
+          LOOP
+            -- first try to update the key
+            UPDATE #{table_name} SET #{column_definitions.map { |c| "#{quote_ident(c.name)} = #{quote_ident(c.input_name)}" }.join(',')} WHERE #{example_row.selector.keys.map { |k| "#{quote_ident(k)} = #{quote_ident([k,'input'].join('_'))}" }.join(' AND ') };
+            IF found THEN
+              RETURN;
+            END IF;
+            -- not there, so try to insert the key
+            -- if someone else inserts the same key concurrently,
+            -- we could get a unique-key failure
+            BEGIN
+              INSERT INTO #{table_name}(#{column_definitions.map { |c| quote_ident(c.name) }.join(',')}) VALUES (#{column_definitions.map { |c| quote_ident(c.input_name) }.join(',')});
+              RETURN;
+            EXCEPTION WHEN unique_violation THEN
+              -- Do nothing, and loop to try the UPDATE again.
+            END;
+          END LOOP;
+        END;
+        $$
+        LANGUAGE plpgsql;
+      EOS
+    end
+  end
+end
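On the PostgreSQL side, each buffered row becomes one call to the temporary merge function created above. A minimal end-to-end sketch, assuming the `pg` gem and a local `upsert_test` database with a `pets` table (connection details and names are illustrative):

```ruby
require 'pg'
require 'upsert'

connection = PG.connect(:dbname => 'upsert_test') # connection details are illustrative
Upsert.stream(connection, :pets) do |upsert|
  # Behind the scenes this builds something like
  #   SELECT pg_temp.merge_pets_<random>('Jerry', 'beagle')
  # and runs it via connection.exec, using the PL/pgSQL function shown above.
  upsert.row({:name => 'Jerry'}, :breed => 'beagle')
end
```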