upsert 0.1.2 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +1 -0
- data/README.md +14 -10
- data/Rakefile +1 -1
- data/lib/upsert.rb +95 -10
- data/lib/upsert/active_record_upsert.rb +12 -0
- data/lib/upsert/mysql2_client.rb +160 -0
- data/lib/upsert/pg_connection.rb +84 -0
- data/lib/upsert/pg_connection/column_definition.rb +60 -0
- data/lib/upsert/row.rb +8 -8
- data/lib/upsert/sqlite3_database.rb +39 -0
- data/lib/upsert/version.rb +1 -1
- data/test/misc/get_postgres_reserved_words.rb +12 -0
- data/test/misc/mysql_reserved.txt +226 -0
- data/test/misc/pg_reserved.txt +742 -0
- data/test/shared/multibyte.rb +2 -2
- data/test/shared/reserved_words.rb +41 -0
- data/test/test_active_record_upsert.rb +23 -0
- data/test/test_mysql2.rb +2 -0
- data/test/test_pg.rb +2 -0
- data/test/test_sqlite.rb +14 -11
- metadata +17 -8
- data/lib/upsert/buffer.rb +0 -58
- data/lib/upsert/buffer/mysql2_client.rb +0 -164
- data/lib/upsert/buffer/pg_connection.rb +0 -87
- data/lib/upsert/buffer/pg_connection/column_definition.rb +0 -60
- data/lib/upsert/buffer/sqlite3_database.rb +0 -43
- data/lib/upsert/quoter.rb +0 -43
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -18,7 +18,9 @@ Let's say you have...
     document = {:breed => 'beagle'}
     upsert.row selector, document
 
-###
+### Streaming upserts (fastest)
+
+Rows are buffered in memory until it's efficient to send them to the database.
 
     Upsert.stream(Pet.connection, Pet.table_name) do |upsert|
       # [...]
@@ -28,7 +30,14 @@ Let's say you have...
       # [...]
     end
 
-
+### With a helper method
+
+For bulk upserts, you probably still want to use `Upsert.stream`.
+
+    # be sure to require 'upsert/active_record_upsert' - it's not required by default
+    selector = {:name => 'Jerry'}
+    document = {:breed => 'beagle'}
+    Pet.upsert selector, document
 
 ## Real-world usage
 
@@ -54,9 +63,9 @@ Using the [mysql2](https://rubygems.org/gems/mysql2) driver.
 From the tests:
 
     Upsert was 77% faster than find + new/set/save
-    Upsert was
-    Upsert was
-    Upsert was
+    Upsert was 58% faster than create + rescue/find/update
+    Upsert was 80% faster than find_or_create + update_attributes
+    Upsert was 39% faster than faking upserts with activerecord-import
 
 #### SQL MERGE trick
 
@@ -199,11 +208,6 @@ This, however, only works on MySQL and requires ActiveRecord—and if all yo
 
 The `selector` and `document` arguments are inspired by the upsert functionality of the [mongo-ruby-driver's update method](http://api.mongodb.org/ruby/1.6.4/Mongo/Collection.html#update-instance_method).
 
-## Wishlist
-
-1. `Pet.upsert`... duh
-2. Don't need a separate buffer class... just extend an instance of Upsert with the appropriate database driver module.
-
 ## Copyright
 
 Copyright 2012 Brighter Planet, Inc.
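Taken together, the new README sections describe two ways in: streaming (buffered) upserts and a per-model helper. A minimal sketch of the streaming form, assuming the `Pet` model and `pets` table from the README examples:

    require 'upsert'

    # Rows accumulate in memory; the library decides when it is
    # efficient to flush them to the database.
    Upsert.stream(Pet.connection, Pet.table_name) do |upsert|
      upsert.row({:name => 'Jerry'}, :breed => 'beagle')
      upsert.row({:name => 'Pierre'}, :breed => 'tabby')
    end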
data/Rakefile
CHANGED
@@ -10,7 +10,7 @@ Rake::TestTask.new(:_test) do |test|
 end
 
 task :test_each_db_adapter do
-  %w{ mysql2 sqlite pg active_record_connection_adapter }.each do |database|
+  %w{ active_record_upsert mysql2 sqlite pg active_record_connection_adapter }.each do |database|
     puts
     puts "#{'*'*10} Running #{database} tests"
     puts
data/lib/upsert.rb
CHANGED
@@ -2,12 +2,10 @@ require 'bigdecimal'
 
 require 'upsert/version'
 require 'upsert/binary'
-require 'upsert/buffer'
-require 'upsert/quoter'
 require 'upsert/row'
-require 'upsert/
-require 'upsert/
-require 'upsert/
+require 'upsert/mysql2_client'
+require 'upsert/pg_connection'
+require 'upsert/sqlite3_database'
 
 class Upsert
   class << self
@@ -33,9 +31,9 @@ class Upsert
     # end
     def stream(connection, table_name)
       upsert = new connection, table_name
-      upsert.
+      upsert.async!
       yield upsert
-      upsert.
+      upsert.sync!
     end
   end
 
@@ -43,13 +41,38 @@ class Upsert
   class TooBig < RuntimeError
   end
 
+  SINGLE_QUOTE = %{'}
+  DOUBLE_QUOTE = %{"}
+  BACKTICK = %{`}
+  E_AND_SINGLE_QUOTE = %{E'}
+  X_AND_SINGLE_QUOTE = %{x'}
+  USEC_SPRINTF = '%06d'
+  ISO8601_DATETIME = '%Y-%m-%d %H:%M:%S'
+  ISO8601_DATE = '%F'
+
+  # @return [Mysql2::Client,Sqlite3::Database,PG::Connection,#raw_connection]
+  attr_reader :connection
+
+  # @return [String,Symbol]
+  attr_reader :table_name
+
   # @private
-  attr_reader :
+  attr_reader :rows
 
   # @param [Mysql2::Client,Sqlite3::Database,PG::Connection,#raw_connection] connection A supported database connection.
   # @param [String,Symbol] table_name The name of the table into which you will be upserting.
   def initialize(connection, table_name)
-    @
+    @table_name = table_name
+    @rows = []
+
+    @connection = if connection.respond_to?(:raw_connection)
+      # deal with ActiveRecord::Base.connection or ActiveRecord::Base.connection_pool.checkout
+      connection.raw_connection
+    else
+      connection
+    end
+
+    extend Upsert.const_get(@connection.class.name.gsub(/\W+/, '_'))
   end
 
   # Upsert a row given a selector and a document.
@@ -68,7 +91,69 @@ class Upsert
   # upsert.row({:name => 'Jerry'}, :breed => 'beagle')
   # upsert.row({:name => 'Pierre'}, :breed => 'tabby')
   def row(selector, document)
-
+    rows << Row.new(self, selector, document)
+    if sql = chunk
+      execute sql
+    end
     nil
   end
+
+  # @private
+  def async?
+    !!@async
+  end
+
+  # @private
+  def async!
+    @async = true
+  end
+
+  # @private
+  def sync!
+    @async = false
+    while sql = chunk
+      execute sql
+    end
+  end
+
+  # @private
+  def quote_value(v)
+    case v
+    when NilClass
+      'NULL'
+    when Upsert::Binary
+      quote_binary v # must be defined by base
+    when String
+      quote_string v # must be defined by base
+    when TrueClass, FalseClass
+      quote_boolean v
+    when BigDecimal
+      quote_big_decimal v
+    when Numeric
+      v
+    when Symbol
+      quote_string v.to_s
+    when Time, DateTime
+      quote_time v # must be defined by base
+    when Date
+      quote_string v.strftime(ISO8601_DATE)
+    else
+      raise "not sure how to quote #{v.class}: #{v.inspect}"
+    end
+  end
+
+  # @private
+  def quote_idents(idents)
+    idents.map { |k| quote_ident(k) }.join(',') # must be defined by base
+  end
+
+  # @private
+  def quote_values(values)
+    values.map { |v| quote_value(v) }.join(',')
+  end
+
+  # @private
+  def quote_pairs(pairs)
+    pairs.map { |k, v| [quote_ident(k),quote_value(v)].join('=') }.join(',')
  end
 end
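The rewritten constructor above is what removed the old `Upsert::Buffer` classes: it unwraps `#raw_connection` when present, then extends the instance with a driver module looked up from the connection's class name. A standalone sketch of that lookup (the class names are the commonly used driver classes; the mapping itself is plain string manipulation):

    # Sketch of the `extend Upsert.const_get(...)` dispatch in #initialize:
    # the driver class name is flattened into a module name.
    ['Mysql2::Client', 'PG::Connection', 'SQLite3::Database'].each do |name|
      puts name.gsub(/\W+/, '_')
    end
    # => Mysql2_Client
    # => PG_Connection
    # => SQLite3_Database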
data/lib/upsert/active_record_upsert.rb
ADDED
@@ -0,0 +1,12 @@
+class Upsert
+  module ActiveRecordUpsert
+    def upsert(selector, document)
+      ActiveRecord::Base.connection_pool.with_connection do |c|
+        upsert = Upsert.new c, table_name
+        upsert.row selector, document
+      end
+    end
+  end
+end
+
+ActiveRecord::Base.extend Upsert::ActiveRecordUpsert
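The `ActiveRecordUpsert` mixin gives every model a class-level `upsert` that checks a connection out of the pool for each call. Usage as the README describes it, with the `Pet` model being the README's example:

    require 'upsert/active_record_upsert' # not loaded by default

    Pet.upsert({:name => 'Jerry'}, :breed => 'beagle')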
data/lib/upsert/mysql2_client.rb
ADDED
@@ -0,0 +1,160 @@
+class Upsert
+  # @private
+  module Mysql2_Client
+    def chunk
+      return if rows.empty?
+      all = rows.length
+      take = all
+      while take > 1 and probably_oversize?(take)
+        take -= 1
+      end
+      if async? and take == all
+        return
+      end
+      while take > 1 and oversize?(take)
+        $stderr.puts " Length prediction via sampling failed, shrinking" if ENV['UPSERT_DEBUG'] == 'true'
+        take -= 1
+      end
+      chunk = sql take
+      while take > 1 and chunk.bytesize > max_sql_bytesize
+        $stderr.puts " Supposedly exact bytesize guess failed, shrinking" if ENV['UPSERT_DEBUG'] == 'true'
+        take -= 1
+        chunk = sql take
+      end
+      if chunk.bytesize > max_sql_bytesize
+        raise TooBig
+      end
+      $stderr.puts " Chunk (#{take}/#{chunk.bytesize}) was #{(chunk.bytesize / max_sql_bytesize.to_f * 100).round}% of the max" if ENV['UPSERT_DEBUG'] == 'true'
+      @rows = rows.drop(take)
+      chunk
+    end
+
+    def execute(sql)
+      connection.query sql
+    end
+
+    def probably_oversize?(take)
+      estimate_sql_bytesize(take) > max_sql_bytesize
+    end
+
+    def oversize?(take)
+      sql_bytesize(take) > max_sql_bytesize
+    end
+
+    def columns
+      @columns ||= rows.first.columns
+    end
+
+    def insert_part
+      @insert_part ||= %{INSERT INTO "#{table_name}" (#{quote_idents(columns)}) VALUES }
+    end
+
+    def update_part
+      @update_part ||= begin
+        updaters = columns.map do |k|
+          qk = quote_ident k
+          [ qk, "VALUES(#{qk})" ].join('=')
+        end.join(',')
+        %{ ON DUPLICATE KEY UPDATE #{updaters}}
+      end
+    end
+
+    # where 2 is the parens
+    def static_sql_bytesize
+      @static_sql_bytesize ||= insert_part.bytesize + update_part.bytesize + 2
+    end
+
+    # where 3 is parens and comma
+    def variable_sql_bytesize(take)
+      rows.first(take).inject(0) { |sum, row| sum + row.values_sql_bytesize + 3 }
+    end
+
+    def estimate_variable_sql_bytesize(take)
+      p = (take / 10.0).ceil
+      10.0 * rows.sample(p).inject(0) { |sum, row| sum + row.values_sql_bytesize + 3 }
+    end
+
+    def sql_bytesize(take)
+      static_sql_bytesize + variable_sql_bytesize(take)
+    end
+
+    def estimate_sql_bytesize(take)
+      static_sql_bytesize + estimate_variable_sql_bytesize(take)
+    end
+
+    def sql(take)
+      all_value_sql = rows.first(take).map { |row| row.values_sql }
+      [ insert_part, '(', all_value_sql.join('),('), ')', update_part ].join
+    end
+
+    # since setting an option like :as => :hash actually persists that option to the client, don't pass any options
+    def max_sql_bytesize
+      @max_sql_bytesize ||= begin
+        case (row = connection.query("SHOW VARIABLES LIKE 'max_allowed_packet'").first)
+        when Array
+          row[1]
+        when Hash
+          row['Value']
+        else
+          raise "Don't know what to do if connection.query returns a #{row.class}"
+        end.to_i
+      end
+    end
+
+    def quoted_value_bytesize(v)
+      case v
+      when NilClass
+        4
+      when TrueClass
+        4
+      when FalseClass
+        5
+      when BigDecimal
+        v.to_s('F').bytesize
+      when Upsert::Binary
+        v.bytesize * 2 + 3
+      when Numeric
+        v.to_s.bytesize
+      when String
+        v.bytesize + 2
+      when Symbol
+        v.to_s.bytesize + 2
+      when Time, DateTime
+        24 + 2
+      when Date
+        10 + 2
+      else
+        raise "not sure how to get quoted length of #{v.class}: #{v.inspect}"
+      end
+    end
+
+    def quote_boolean(v)
+      v ? 'TRUE' : 'FALSE'
+    end
+
+    def quote_string(v)
+      SINGLE_QUOTE + connection.escape(v) + SINGLE_QUOTE
+    end
+
+    # This doubles the size of the representation.
+    def quote_binary(v)
+      X_AND_SINGLE_QUOTE + v.unpack("H*")[0] + SINGLE_QUOTE
+    end
+
+    # put raw binary straight into sql
+    # might work if we could get the encoding issues fixed when joining together the values for the sql
+    # alias_method :quote_binary, :quote_string
+
+    def quote_time(v)
+      quote_string v.strftime(ISO8601_DATETIME)
+    end
+
+    def quote_ident(k)
+      BACKTICK + connection.escape(k.to_s) + BACKTICK
+    end
+
+    def quote_big_decimal(v)
+      v.to_s('F')
+    end
+  end
+end
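`Mysql2_Client#chunk` renders one multi-row `INSERT ... ON DUPLICATE KEY UPDATE` statement and shrinks the batch until it fits under the server's `max_allowed_packet`. A standalone illustration of that shrink-to-fit idea, simplified to plain strings and an arbitrary 4 KB cap standing in for the server variable (not the gem's code, which also estimates sizes by sampling before rendering anything):

    # Take as many buffered rows as possible, then drop rows one at a
    # time until the rendered SQL fits under the size limit.
    rows = Array.new(500) { |i| "('pet #{i}','breed #{i}')" }
    insert_part = "INSERT INTO pets (name,breed) VALUES "
    max_sql_bytesize = 4096 # stand-in for max_allowed_packet

    take = rows.length
    sql = insert_part + rows.first(take).join(',')
    while take > 1 && sql.bytesize > max_sql_bytesize
      take -= 1
      sql = insert_part + rows.first(take).join(',')
    end

    puts "sending #{take} of #{rows.length} rows in #{sql.bytesize} bytes"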
data/lib/upsert/pg_connection.rb
ADDED
@@ -0,0 +1,84 @@
+require 'upsert/pg_connection/column_definition'
+
+class Upsert
+  # @private
+  module PG_Connection
+
+    attr_reader :merge_function
+
+    def chunk
+      return if rows.empty?
+      row = rows.shift
+      unless merge_function
+        create_merge_function row
+      end
+      hsh = row.to_hash
+      ordered_args = column_definitions.map do |c|
+        hsh[c.name]
+      end
+      %{SELECT #{merge_function}(#{quote_values(ordered_args)})}
+    end
+
+    def execute(sql)
+      connection.exec sql
+    end
+
+    def quote_string(v)
+      SINGLE_QUOTE + connection.escape_string(v) + SINGLE_QUOTE
+    end
+
+    def quote_binary(v)
+      E_AND_SINGLE_QUOTE + connection.escape_bytea(v) + SINGLE_QUOTE
+    end
+
+    def quote_time(v)
+      quote_string [v.strftime(ISO8601_DATETIME), sprintf(USEC_SPRINTF, v.usec)].join('.')
+    end
+
+    def quote_big_decimal(v)
+      v.to_s('F')
+    end
+
+    def quote_boolean(v)
+      v ? 'TRUE' : 'FALSE'
+    end
+
+    def quote_ident(k)
+      connection.quote_ident k.to_s
+    end
+
+    def column_definitions
+      @column_definitions ||= ColumnDefinition.all(connection, table_name)
+    end
+
+    private
+
+    def create_merge_function(example_row)
+      @merge_function = "pg_temp.merge_#{table_name}_#{Kernel.rand(1e11)}"
+      execute <<-EOS
+        CREATE FUNCTION #{merge_function}(#{column_definitions.map { |c| "#{quote_ident(c.input_name)} #{c.sql_type} DEFAULT #{c.default || 'NULL'}" }.join(',') }) RETURNS VOID AS
+        $$
+        BEGIN
+          LOOP
+            -- first try to update the key
+            UPDATE #{table_name} SET #{column_definitions.map { |c| "#{quote_ident(c.name)} = #{quote_ident(c.input_name)}" }.join(',')} WHERE #{example_row.selector.keys.map { |k| "#{quote_ident(k)} = #{quote_ident([k,'input'].join('_'))}" }.join(' AND ') };
+            IF found THEN
+              RETURN;
+            END IF;
+            -- not there, so try to insert the key
+            -- if someone else inserts the same key concurrently,
+            -- we could get a unique-key failure
+            BEGIN
+              INSERT INTO #{table_name}(#{column_definitions.map { |c| quote_ident(c.name) }.join(',')}) VALUES (#{column_definitions.map { |c| quote_ident(c.input_name) }.join(',')});
+              RETURN;
+            EXCEPTION WHEN unique_violation THEN
+              -- Do nothing, and loop to try the UPDATE again.
+            END;
+          END LOOP;
+        END;
+        $$
+        LANGUAGE plpgsql;
+      EOS
+    end
+  end
+end
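On PostgreSQL, `PG_Connection` installs a temporary merge function per `Upsert` instance and then calls it once per buffered row. A hand-expanded sketch of what the heredoc above would generate for a hypothetical `pets(name, breed)` table with `:name` as the selector; the parameter names, column types, connection details, and the numeric suffix are all illustrative:

    require 'pg'

    connection = PG.connect(:dbname => 'upsert_test') # illustrative database

    # Roughly the CREATE FUNCTION statement the template would generate.
    connection.exec <<-SQL
      CREATE FUNCTION pg_temp.merge_pets_42("name_input" character varying DEFAULT NULL,"breed_input" character varying DEFAULT NULL) RETURNS VOID AS
      $$
      BEGIN
        LOOP
          -- first try to update the key
          UPDATE pets SET "name" = "name_input","breed" = "breed_input" WHERE "name" = "name_input";
          IF found THEN
            RETURN;
          END IF;
          -- not there, so try to insert the key;
          -- a concurrent insert of the same key raises unique_violation,
          -- which sends us back around the loop to the UPDATE
          BEGIN
            INSERT INTO pets("name","breed") VALUES ("name_input","breed_input");
            RETURN;
          EXCEPTION WHEN unique_violation THEN
            -- do nothing, loop and try the UPDATE again
          END;
        END LOOP;
      END;
      $$
      LANGUAGE plpgsql;
    SQL

    # Each buffered row then becomes a single function call:
    connection.exec %{SELECT pg_temp.merge_pets_42('Jerry','beagle')}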