upsert 0.1.2 → 0.2.0

data/.gitignore CHANGED
@@ -1,3 +1,4 @@
+ .DS_Store
  *.gem
  *.rbc
  .bundle
data/README.md CHANGED
@@ -18,7 +18,9 @@ Let's say you have...
  document = {:breed => 'beagle'}
  upsert.row selector, document

- ### Multiple upserts bundled together for speed
+ ### Streaming upserts (fastest)
+
+ Rows are buffered in memory until it's efficient to send them to the database.

  Upsert.stream(Pet.connection, Pet.table_name) do |upsert|
    # [...]
@@ -28,7 +30,14 @@ Let's say you have...
    # [...]
  end

- Rows are buffered in memory until it's efficient to send them to the database.
+ ### With a helper method
+
+ For bulk upserts, you probably still want to use `Upsert.stream`.
+
+ # be sure to require 'upsert/active_record_upsert' - it's not required by default
+ selector = {:name => 'Jerry'}
+ document = {:breed => 'beagle'}
+ Pet.upsert selector, document

  ## Real-world usage

@@ -54,9 +63,9 @@ Using the [mysql2](https://rubygems.org/gems/mysql2) driver.
  From the tests:

  Upsert was 77% faster than find + new/set/save
- Upsert was 84% faster than create + rescue/find/update
- Upsert was 82% faster than find_or_create + update_attributes
- Upsert was 47% faster than faking upserts with activerecord-import
+ Upsert was 58% faster than create + rescue/find/update
+ Upsert was 80% faster than find_or_create + update_attributes
+ Upsert was 39% faster than faking upserts with activerecord-import

  #### SQL MERGE trick

@@ -199,11 +208,6 @@ This, however, only works on MySQL and requires ActiveRecord—and if all yo
 
  The `selector` and `document` arguments are inspired by the upsert functionality of the [mongo-ruby-driver's update method](http://api.mongodb.org/ruby/1.6.4/Mongo/Collection.html#update-instance_method).

- ## Wishlist
-
- 1. `Pet.upsert`... duh
- 2. Don't need a separate buffer class... just extend an instance of Upsert with the appropriate database driver module.
-
  ## Copyright

  Copyright 2012 Brighter Planet, Inc.
data/Rakefile CHANGED
@@ -10,7 +10,7 @@ Rake::TestTask.new(:_test) do |test|
  end

  task :test_each_db_adapter do
-   %w{ mysql2 sqlite pg active_record_connection_adapter }.each do |database|
+   %w{ active_record_upsert mysql2 sqlite pg active_record_connection_adapter }.each do |database|
      puts
      puts "#{'*'*10} Running #{database} tests"
      puts
data/lib/upsert.rb CHANGED
@@ -2,12 +2,10 @@ require 'bigdecimal'
 
  require 'upsert/version'
  require 'upsert/binary'
- require 'upsert/buffer'
- require 'upsert/quoter'
  require 'upsert/row'
- require 'upsert/buffer/mysql2_client'
- require 'upsert/buffer/pg_connection'
- require 'upsert/buffer/sqlite3_database'
+ require 'upsert/mysql2_client'
+ require 'upsert/pg_connection'
+ require 'upsert/sqlite3_database'

  class Upsert
    class << self
@@ -33,9 +31,9 @@ class Upsert
      # end
      def stream(connection, table_name)
        upsert = new connection, table_name
-       upsert.buffer.async!
+       upsert.async!
        yield upsert
-       upsert.buffer.sync!
+       upsert.sync!
      end
    end

@@ -43,13 +41,38 @@ class Upsert
    class TooBig < RuntimeError
    end

+   SINGLE_QUOTE = %{'}
+   DOUBLE_QUOTE = %{"}
+   BACKTICK = %{`}
+   E_AND_SINGLE_QUOTE = %{E'}
+   X_AND_SINGLE_QUOTE = %{x'}
+   USEC_SPRINTF = '%06d'
+   ISO8601_DATETIME = '%Y-%m-%d %H:%M:%S'
+   ISO8601_DATE = '%F'
+
+   # @return [Mysql2::Client,Sqlite3::Database,PG::Connection,#raw_connection]
+   attr_reader :connection
+
+   # @return [String,Symbol]
+   attr_reader :table_name
+
    # @private
-   attr_reader :buffer
+   attr_reader :rows

    # @param [Mysql2::Client,Sqlite3::Database,PG::Connection,#raw_connection] connection A supported database connection.
    # @param [String,Symbol] table_name The name of the table into which you will be upserting.
    def initialize(connection, table_name)
-     @buffer = Buffer.for connection, table_name
+     @table_name = table_name
+     @rows = []
+
+     @connection = if connection.respond_to?(:raw_connection)
+       # deal with ActiveRecord::Base.connection or ActiveRecord::Base.connection_pool.checkout
+       connection.raw_connection
+     else
+       connection
+     end
+
+     extend Upsert.const_get(@connection.class.name.gsub(/\W+/, '_'))
    end

    # Upsert a row given a selector and a document.
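The `extend Upsert.const_get(...)` call above is what replaces the old `Buffer.for` factory: the raw connection's class name is mangled into a module name and mixed into the instance. A rough illustration of the name mangling, using the two driver classes that appear later in this diff:

    # "Mysql2::Client" -> Upsert::Mysql2_Client
    # "PG::Connection" -> Upsert::PG_Connection
    "Mysql2::Client".gsub(/\W+/, '_') # => "Mysql2_Client"
    "PG::Connection".gsub(/\W+/, '_') # => "PG_Connection"
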
@@ -68,7 +91,69 @@ class Upsert
    # upsert.row({:name => 'Jerry'}, :breed => 'beagle')
    # upsert.row({:name => 'Pierre'}, :breed => 'tabby')
    def row(selector, document)
-     buffer.add selector, document
+     rows << Row.new(self, selector, document)
+     if sql = chunk
+       execute sql
+     end
      nil
    end
+
+   # @private
+   def async?
+     !!@async
+   end
+
+   # @private
+   def async!
+     @async = true
+   end
+
+   # @private
+   def sync!
+     @async = false
+     while sql = chunk
+       execute sql
+     end
+   end
+
+   # @private
+   def quote_value(v)
+     case v
+     when NilClass
+       'NULL'
+     when Upsert::Binary
+       quote_binary v # must be defined by base
+     when String
+       quote_string v # must be defined by base
+     when TrueClass, FalseClass
+       quote_boolean v
+     when BigDecimal
+       quote_big_decimal v
+     when Numeric
+       v
+     when Symbol
+       quote_string v.to_s
+     when Time, DateTime
+       quote_time v # must be defined by base
+     when Date
+       quote_string v.strftime(ISO8601_DATE)
+     else
+       raise "not sure how to quote #{v.class}: #{v.inspect}"
+     end
+   end
+
+   # @private
+   def quote_idents(idents)
+     idents.map { |k| quote_ident(k) }.join(',') # must be defined by base
+   end
+
+   # @private
+   def quote_values(values)
+     values.map { |v| quote_value(v) }.join(',')
+   end
+
+   # @private
+   def quote_pairs(pairs)
+     pairs.map { |k, v| [quote_ident(k),quote_value(v)].join('=') }.join(',')
+   end
  end
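Taken together, the new `row`, `async!`, and `sync!` methods are what `Upsert.stream` builds on: in async mode rows pile up until a full chunk is worth sending, and `sync!` drains whatever is left. A sketch of the equivalent manual sequence (same effect as the `stream` class method shown earlier; `Pet` is the example model from the README):

    upsert = Upsert.new Pet.connection, Pet.table_name
    upsert.async!                                       # buffer rows; only flush full-sized chunks
    upsert.row({:name => 'Jerry'}, :breed => 'beagle')
    upsert.row({:name => 'Pierre'}, :breed => 'tabby')
    upsert.sync!                                        # flush any remaining buffered rows

Note that `async!` and `sync!` are marked `# @private`, so in real code reach for `Upsert.stream` rather than calling them directly.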
data/lib/upsert/active_record_upsert.rb ADDED
@@ -0,0 +1,12 @@
+ class Upsert
+   module ActiveRecordUpsert
+     def upsert(selector, document)
+       ActiveRecord::Base.connection_pool.with_connection do |c|
+         upsert = Upsert.new c, table_name
+         upsert.row selector, document
+       end
+     end
+   end
+ end
+
+ ActiveRecord::Base.extend Upsert::ActiveRecordUpsert
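The helper is opt-in: requiring this file extends `ActiveRecord::Base`, so every model gains a class-level `upsert` that checks a connection out of the pool and upserts a single row. A minimal sketch using the README's `Pet` model:

    require 'upsert/active_record_upsert' # not pulled in by a plain require 'upsert'

    Pet.upsert({:name => 'Jerry'}, :breed => 'beagle')
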
data/lib/upsert/mysql2_client.rb ADDED
@@ -0,0 +1,160 @@
+ class Upsert
+   # @private
+   module Mysql2_Client
+     def chunk
+       return if rows.empty?
+       all = rows.length
+       take = all
+       while take > 1 and probably_oversize?(take)
+         take -= 1
+       end
+       if async? and take == all
+         return
+       end
+       while take > 1 and oversize?(take)
+         $stderr.puts "  Length prediction via sampling failed, shrinking" if ENV['UPSERT_DEBUG'] == 'true'
+         take -= 1
+       end
+       chunk = sql take
+       while take > 1 and chunk.bytesize > max_sql_bytesize
+         $stderr.puts "  Supposedly exact bytesize guess failed, shrinking" if ENV['UPSERT_DEBUG'] == 'true'
+         take -= 1
+         chunk = sql take
+       end
+       if chunk.bytesize > max_sql_bytesize
+         raise TooBig
+       end
+       $stderr.puts "  Chunk (#{take}/#{chunk.bytesize}) was #{(chunk.bytesize / max_sql_bytesize.to_f * 100).round}% of the max" if ENV['UPSERT_DEBUG'] == 'true'
+       @rows = rows.drop(take)
+       chunk
+     end
+
+     def execute(sql)
+       connection.query sql
+     end
+
+     def probably_oversize?(take)
+       estimate_sql_bytesize(take) > max_sql_bytesize
+     end
+
+     def oversize?(take)
+       sql_bytesize(take) > max_sql_bytesize
+     end
+
+     def columns
+       @columns ||= rows.first.columns
+     end
+
+     def insert_part
+       @insert_part ||= %{INSERT INTO "#{table_name}" (#{quote_idents(columns)}) VALUES }
+     end
+
+     def update_part
+       @update_part ||= begin
+         updaters = columns.map do |k|
+           qk = quote_ident k
+           [ qk, "VALUES(#{qk})" ].join('=')
+         end.join(',')
+         %{ ON DUPLICATE KEY UPDATE #{updaters}}
+       end
+     end
+
+     # where 2 is the parens
+     def static_sql_bytesize
+       @static_sql_bytesize ||= insert_part.bytesize + update_part.bytesize + 2
+     end
+
+     # where 3 is parens and comma
+     def variable_sql_bytesize(take)
+       rows.first(take).inject(0) { |sum, row| sum + row.values_sql_bytesize + 3 }
+     end
+
+     def estimate_variable_sql_bytesize(take)
+       p = (take / 10.0).ceil
+       10.0 * rows.sample(p).inject(0) { |sum, row| sum + row.values_sql_bytesize + 3 }
+     end
+
+     def sql_bytesize(take)
+       static_sql_bytesize + variable_sql_bytesize(take)
+     end
+
+     def estimate_sql_bytesize(take)
+       static_sql_bytesize + estimate_variable_sql_bytesize(take)
+     end
+
+     def sql(take)
+       all_value_sql = rows.first(take).map { |row| row.values_sql }
+       [ insert_part, '(', all_value_sql.join('),('), ')', update_part ].join
+     end
+
+     # since setting an option like :as => :hash actually persists that option to the client, don't pass any options
+     def max_sql_bytesize
+       @max_sql_bytesize ||= begin
+         case (row = connection.query("SHOW VARIABLES LIKE 'max_allowed_packet'").first)
+         when Array
+           row[1]
+         when Hash
+           row['Value']
+         else
+           raise "Don't know what to do if connection.query returns a #{row.class}"
+         end.to_i
+       end
+     end
+
+     def quoted_value_bytesize(v)
+       case v
+       when NilClass
+         4
+       when TrueClass
+         4
+       when FalseClass
+         5
+       when BigDecimal
+         v.to_s('F').bytesize
+       when Upsert::Binary
+         v.bytesize * 2 + 3
+       when Numeric
+         v.to_s.bytesize
+       when String
+         v.bytesize + 2
+       when Symbol
+         v.to_s.bytesize + 2
+       when Time, DateTime
+         24 + 2
+       when Date
+         10 + 2
+       else
+         raise "not sure how to get quoted length of #{v.class}: #{v.inspect}"
+       end
+     end
+
+     def quote_boolean(v)
+       v ? 'TRUE' : 'FALSE'
+     end
+
+     def quote_string(v)
+       SINGLE_QUOTE + connection.escape(v) + SINGLE_QUOTE
+     end
+
+     # This doubles the size of the representation.
+     def quote_binary(v)
+       X_AND_SINGLE_QUOTE + v.unpack("H*")[0] + SINGLE_QUOTE
+     end
+
+     # put raw binary straight into sql
+     # might work if we could get the encoding issues fixed when joining together the values for the sql
+     # alias_method :quote_binary, :quote_string
+
+     def quote_time(v)
+       quote_string v.strftime(ISO8601_DATETIME)
+     end
+
+     def quote_ident(k)
+       BACKTICK + connection.escape(k.to_s) + BACKTICK
+     end
+
+     def quote_big_decimal(v)
+       v.to_s('F')
+     end
+   end
+ end
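For reference, the statement assembled by `insert_part`, `sql(take)`, and `update_part` above is a single multi-row `INSERT ... ON DUPLICATE KEY UPDATE`. For a hypothetical `pets` table with `name` and `breed` columns and two buffered rows, one chunk would look roughly like this (identifier and value quoting as implemented above):

    INSERT INTO "pets" (`name`,`breed`) VALUES ('Jerry','beagle'),('Pierre','tabby') ON DUPLICATE KEY UPDATE `name`=VALUES(`name`),`breed`=VALUES(`breed`)

`chunk` keeps shrinking `take` until the statement fits under the server's `max_allowed_packet`, so a large stream is sent as a series of such statements.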
data/lib/upsert/pg_connection.rb ADDED
@@ -0,0 +1,84 @@
+ require 'upsert/pg_connection/column_definition'
+
+ class Upsert
+   # @private
+   module PG_Connection
+
+     attr_reader :merge_function
+
+     def chunk
+       return if rows.empty?
+       row = rows.shift
+       unless merge_function
+         create_merge_function row
+       end
+       hsh = row.to_hash
+       ordered_args = column_definitions.map do |c|
+         hsh[c.name]
+       end
+       %{SELECT #{merge_function}(#{quote_values(ordered_args)})}
+     end
+
+     def execute(sql)
+       connection.exec sql
+     end
+
+     def quote_string(v)
+       SINGLE_QUOTE + connection.escape_string(v) + SINGLE_QUOTE
+     end
+
+     def quote_binary(v)
+       E_AND_SINGLE_QUOTE + connection.escape_bytea(v) + SINGLE_QUOTE
+     end
+
+     def quote_time(v)
+       quote_string [v.strftime(ISO8601_DATETIME), sprintf(USEC_SPRINTF, v.usec)].join('.')
+     end
+
+     def quote_big_decimal(v)
+       v.to_s('F')
+     end
+
+     def quote_boolean(v)
+       v ? 'TRUE' : 'FALSE'
+     end
+
+     def quote_ident(k)
+       connection.quote_ident k.to_s
+     end
+
+     def column_definitions
+       @column_definitions ||= ColumnDefinition.all(connection, table_name)
+     end
+
+     private
+
+     def create_merge_function(example_row)
+       @merge_function = "pg_temp.merge_#{table_name}_#{Kernel.rand(1e11)}"
+       execute <<-EOS
+         CREATE FUNCTION #{merge_function}(#{column_definitions.map { |c| "#{quote_ident(c.input_name)} #{c.sql_type} DEFAULT #{c.default || 'NULL'}" }.join(',') }) RETURNS VOID AS
+         $$
+         BEGIN
+           LOOP
+             -- first try to update the key
+             UPDATE #{table_name} SET #{column_definitions.map { |c| "#{quote_ident(c.name)} = #{quote_ident(c.input_name)}" }.join(',')} WHERE #{example_row.selector.keys.map { |k| "#{quote_ident(k)} = #{quote_ident([k,'input'].join('_'))}" }.join(' AND ') };
+             IF found THEN
+               RETURN;
+             END IF;
+             -- not there, so try to insert the key
+             -- if someone else inserts the same key concurrently,
+             -- we could get a unique-key failure
+             BEGIN
+               INSERT INTO #{table_name}(#{column_definitions.map { |c| quote_ident(c.name) }.join(',')}) VALUES (#{column_definitions.map { |c| quote_ident(c.input_name) }.join(',')});
+               RETURN;
+             EXCEPTION WHEN unique_violation THEN
+               -- Do nothing, and loop to try the UPDATE again.
+             END;
+           END LOOP;
+         END;
+         $$
+         LANGUAGE plpgsql;
+       EOS
+     end
+   end
+ end
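The PostgreSQL strategy differs from the MySQL one above: `create_merge_function` installs a temporary update-then-insert function once per `Upsert` instance (retrying on `unique_violation`, per the loop in the plpgsql body), and `chunk` then emits one `SELECT` per buffered row. For the same hypothetical `pets` table with `name` and `breed` columns, a per-row statement looks roughly like:

    -- one call per buffered row; <suffix> is random per Upsert instance (Kernel.rand(1e11))
    SELECT pg_temp.merge_pets_<suffix>('Jerry','beagle')

The arguments follow `column_definitions` order, with missing columns falling back to the function's declared defaults.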