upsert 0.1.2 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore CHANGED
@@ -1,3 +1,4 @@
+ .DS_Store
  *.gem
  *.rbc
  .bundle
data/README.md CHANGED
@@ -18,7 +18,9 @@ Let's say you have...
      document = {:breed => 'beagle'}
      upsert.row selector, document
 
- ### Multiple upserts bundled together for speed
+ ### Streaming upserts (fastest)
+
+ Rows are buffered in memory until it's efficient to send them to the database.
 
      Upsert.stream(Pet.connection, Pet.table_name) do |upsert|
        # [...]
@@ -28,7 +30,14 @@ Let's say you have...
        # [...]
      end
 
- Rows are buffered in memory until it's efficient to send them to the database.
+ ### With a helper method
+
+ For bulk upserts, you probably still want to use `Upsert.stream`.
+
+     # be sure to require 'upsert/active_record_upsert' - it's not required by default
+     selector = {:name => 'Jerry'}
+     document = {:breed => 'beagle'}
+     Pet.upsert selector, document
 
  ## Real-world usage
 
@@ -54,9 +63,9 @@ Using the [mysql2](https://rubygems.org/gems/mysql2) driver.
  From the tests:
 
      Upsert was 77% faster than find + new/set/save
-     Upsert was 84% faster than create + rescue/find/update
-     Upsert was 82% faster than find_or_create + update_attributes
-     Upsert was 47% faster than faking upserts with activerecord-import
+     Upsert was 58% faster than create + rescue/find/update
+     Upsert was 80% faster than find_or_create + update_attributes
+     Upsert was 39% faster than faking upserts with activerecord-import
 
  #### SQL MERGE trick
 
@@ -199,11 +208,6 @@ This, however, only works on MySQL and requires ActiveRecord—and if all yo
 
  The `selector` and `document` arguments are inspired by the upsert functionality of the [mongo-ruby-driver's update method](http://api.mongodb.org/ruby/1.6.4/Mongo/Collection.html#update-instance_method).
 
- ## Wishlist
-
- 1. `Pet.upsert`... duh
- 2. Don't need a separate buffer class... just extend an instance of Upsert with the appropriate database driver module.
-
  ## Copyright
 
  Copyright 2012 Brighter Planet, Inc.
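
The streaming behavior described in the README maps onto the new `async!`/`sync!` methods in lib/upsert.rb below: inside an `Upsert.stream` block each `row` call is buffered, and whatever is still buffered is flushed when the block returns. A minimal usage sketch, assuming a `Pet` ActiveRecord model and a hypothetical `records` array of attribute hashes:

    require 'upsert'

    records = [
      {:name => 'Jerry',  :breed => 'beagle'},
      {:name => 'Pierre', :breed => 'tabby'}
    ]

    # Rows accumulate in memory and are sent in batches, not one statement per row.
    Upsert.stream(Pet.connection, Pet.table_name) do |upsert|
      records.each do |attrs|
        upsert.row({:name => attrs[:name]}, :breed => attrs[:breed])
      end
    end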
data/Rakefile CHANGED
@@ -10,7 +10,7 @@ Rake::TestTask.new(:_test) do |test|
  end
 
  task :test_each_db_adapter do
-   %w{ mysql2 sqlite pg active_record_connection_adapter }.each do |database|
+   %w{ active_record_upsert mysql2 sqlite pg active_record_connection_adapter }.each do |database|
      puts
      puts "#{'*'*10} Running #{database} tests"
      puts
data/lib/upsert.rb CHANGED
@@ -2,12 +2,10 @@ require 'bigdecimal'
 
  require 'upsert/version'
  require 'upsert/binary'
- require 'upsert/buffer'
- require 'upsert/quoter'
  require 'upsert/row'
- require 'upsert/buffer/mysql2_client'
- require 'upsert/buffer/pg_connection'
- require 'upsert/buffer/sqlite3_database'
+ require 'upsert/mysql2_client'
+ require 'upsert/pg_connection'
+ require 'upsert/sqlite3_database'
 
  class Upsert
    class << self
@@ -33,9 +31,9 @@ class Upsert
      # end
      def stream(connection, table_name)
        upsert = new connection, table_name
-       upsert.buffer.async!
+       upsert.async!
        yield upsert
-       upsert.buffer.sync!
+       upsert.sync!
      end
    end
 
@@ -43,13 +41,38 @@ class Upsert
    class TooBig < RuntimeError
    end
 
+   SINGLE_QUOTE = %{'}
+   DOUBLE_QUOTE = %{"}
+   BACKTICK = %{`}
+   E_AND_SINGLE_QUOTE = %{E'}
+   X_AND_SINGLE_QUOTE = %{x'}
+   USEC_SPRINTF = '%06d'
+   ISO8601_DATETIME = '%Y-%m-%d %H:%M:%S'
+   ISO8601_DATE = '%F'
+
+   # @return [Mysql2::Client,Sqlite3::Database,PG::Connection,#raw_connection]
+   attr_reader :connection
+
+   # @return [String,Symbol]
+   attr_reader :table_name
+
    # @private
-   attr_reader :buffer
+   attr_reader :rows
 
    # @param [Mysql2::Client,Sqlite3::Database,PG::Connection,#raw_connection] connection A supported database connection.
    # @param [String,Symbol] table_name The name of the table into which you will be upserting.
    def initialize(connection, table_name)
-     @buffer = Buffer.for connection, table_name
+     @table_name = table_name
+     @rows = []
+
+     @connection = if connection.respond_to?(:raw_connection)
+       # deal with ActiveRecord::Base.connection or ActiveRecord::Base.connection_pool.checkout
+       connection.raw_connection
+     else
+       connection
+     end
+
+     extend Upsert.const_get(@connection.class.name.gsub(/\W+/, '_'))
    end
 
    # Upsert a row given a selector and a document.
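
The `extend Upsert.const_get(...)` call above replaces the old `Buffer.for` factory: the raw connection's class name is mangled into a module name and mixed into the instance. A quick illustration of the mapping, using the two driver classes that appear later in this diff:

    # Illustrative only: how a connection class name becomes a driver module name.
    ['Mysql2::Client', 'PG::Connection'].each do |class_name|
      puts class_name.gsub(/\W+/, '_')
    end
    # => Mysql2_Client   (mixed in as Upsert::Mysql2_Client)
    # => PG_Connection   (mixed in as Upsert::PG_Connection)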
@@ -68,7 +91,69 @@ class Upsert
    # upsert.row({:name => 'Jerry'}, :breed => 'beagle')
    # upsert.row({:name => 'Pierre'}, :breed => 'tabby')
    def row(selector, document)
-     buffer.add selector, document
+     rows << Row.new(self, selector, document)
+     if sql = chunk
+       execute sql
+     end
      nil
    end
+
+   # @private
+   def async?
+     !!@async
+   end
+
+   # @private
+   def async!
+     @async = true
+   end
+
+   # @private
+   def sync!
+     @async = false
+     while sql = chunk
+       execute sql
+     end
+   end
+
+   # @private
+   def quote_value(v)
+     case v
+     when NilClass
+       'NULL'
+     when Upsert::Binary
+       quote_binary v # must be defined by base
+     when String
+       quote_string v # must be defined by base
+     when TrueClass, FalseClass
+       quote_boolean v
+     when BigDecimal
+       quote_big_decimal v
+     when Numeric
+       v
+     when Symbol
+       quote_string v.to_s
+     when Time, DateTime
+       quote_time v # must be defined by base
+     when Date
+       quote_string v.strftime(ISO8601_DATE)
+     else
+       raise "not sure how to quote #{v.class}: #{v.inspect}"
+     end
+   end
+
+   # @private
+   def quote_idents(idents)
+     idents.map { |k| quote_ident(k) }.join(',') # must be defined by base
+   end
+
+   # @private
+   def quote_values(values)
+     values.map { |v| quote_value(v) }.join(',')
+   end
+
+   # @private
+   def quote_pairs(pairs)
+     pairs.map { |k, v| [quote_ident(k),quote_value(v)].join('=') }.join(',')
+   end
  end
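
The `chunk`/`execute` pair and the quote hooks marked "must be defined by base" form the contract each driver module has to satisfy once it is extended onto the instance. A hedged outline (the module name and empty bodies are placeholders, not code from the gem):

    class Upsert
      module SomeDriver                # hypothetical driver module
        def chunk; end                 # return a SQL string ready to run, or nil
        def execute(sql); end          # run it on the raw connection

        def quote_ident(k); end        # used by quote_idents
        def quote_string(v); end       # used by quote_value
        def quote_binary(v); end
        def quote_time(v); end
        def quote_boolean(v); end
        def quote_big_decimal(v); end
      end
    end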
@@ -0,0 +1,12 @@
+ class Upsert
+   module ActiveRecordUpsert
+     def upsert(selector, document)
+       ActiveRecord::Base.connection_pool.with_connection do |c|
+         upsert = Upsert.new c, table_name
+         upsert.row selector, document
+       end
+     end
+   end
+ end
+
+ ActiveRecord::Base.extend Upsert::ActiveRecordUpsert
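
Each call to this helper checks a connection out of the pool and builds a throwaway `Upsert` instance, which is why the README still points bulk loads at `Upsert.stream`. A hedged comparison sketch:

    require 'upsert'
    require 'upsert/active_record_upsert'   # not required by default

    # Fine for a one-off upsert:
    Pet.upsert({:name => 'Jerry'}, :breed => 'beagle')

    # For many rows, stream so statements can be batched on a single connection:
    Upsert.stream(Pet.connection, Pet.table_name) do |upsert|
      upsert.row({:name => 'Jerry'},  :breed => 'beagle')
      upsert.row({:name => 'Pierre'}, :breed => 'tabby')
    end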
@@ -0,0 +1,160 @@
+ class Upsert
+   # @private
+   module Mysql2_Client
+     def chunk
+       return if rows.empty?
+       all = rows.length
+       take = all
+       while take > 1 and probably_oversize?(take)
+         take -= 1
+       end
+       if async? and take == all
+         return
+       end
+       while take > 1 and oversize?(take)
+         $stderr.puts " Length prediction via sampling failed, shrinking" if ENV['UPSERT_DEBUG'] == 'true'
+         take -= 1
+       end
+       chunk = sql take
+       while take > 1 and chunk.bytesize > max_sql_bytesize
+         $stderr.puts " Supposedly exact bytesize guess failed, shrinking" if ENV['UPSERT_DEBUG'] == 'true'
+         take -= 1
+         chunk = sql take
+       end
+       if chunk.bytesize > max_sql_bytesize
+         raise TooBig
+       end
+       $stderr.puts " Chunk (#{take}/#{chunk.bytesize}) was #{(chunk.bytesize / max_sql_bytesize.to_f * 100).round}% of the max" if ENV['UPSERT_DEBUG'] == 'true'
+       @rows = rows.drop(take)
+       chunk
+     end
+
+     def execute(sql)
+       connection.query sql
+     end
+
+     def probably_oversize?(take)
+       estimate_sql_bytesize(take) > max_sql_bytesize
+     end
+
+     def oversize?(take)
+       sql_bytesize(take) > max_sql_bytesize
+     end
+
+     def columns
+       @columns ||= rows.first.columns
+     end
+
+     def insert_part
+       @insert_part ||= %{INSERT INTO "#{table_name}" (#{quote_idents(columns)}) VALUES }
+     end
+
+     def update_part
+       @update_part ||= begin
+         updaters = columns.map do |k|
+           qk = quote_ident k
+           [ qk, "VALUES(#{qk})" ].join('=')
+         end.join(',')
+         %{ ON DUPLICATE KEY UPDATE #{updaters}}
+       end
+     end
+
+     # where 2 is the parens
+     def static_sql_bytesize
+       @static_sql_bytesize ||= insert_part.bytesize + update_part.bytesize + 2
+     end
+
+     # where 3 is parens and comma
+     def variable_sql_bytesize(take)
+       rows.first(take).inject(0) { |sum, row| sum + row.values_sql_bytesize + 3 }
+     end
+
+     def estimate_variable_sql_bytesize(take)
+       p = (take / 10.0).ceil
+       10.0 * rows.sample(p).inject(0) { |sum, row| sum + row.values_sql_bytesize + 3 }
+     end
+
+     def sql_bytesize(take)
+       static_sql_bytesize + variable_sql_bytesize(take)
+     end
+
+     def estimate_sql_bytesize(take)
+       static_sql_bytesize + estimate_variable_sql_bytesize(take)
+     end
+
+     def sql(take)
+       all_value_sql = rows.first(take).map { |row| row.values_sql }
+       [ insert_part, '(', all_value_sql.join('),('), ')', update_part ].join
+     end
+
+     # since setting an option like :as => :hash actually persists that option to the client, don't pass any options
+     def max_sql_bytesize
+       @max_sql_bytesize ||= begin
+         case (row = connection.query("SHOW VARIABLES LIKE 'max_allowed_packet'").first)
+         when Array
+           row[1]
+         when Hash
+           row['Value']
+         else
+           raise "Don't know what to do if connection.query returns a #{row.class}"
+         end.to_i
+       end
+     end
+
+     def quoted_value_bytesize(v)
+       case v
+       when NilClass
+         4
+       when TrueClass
+         4
+       when FalseClass
+         5
+       when BigDecimal
+         v.to_s('F').bytesize
+       when Upsert::Binary
+         v.bytesize * 2 + 3
+       when Numeric
+         v.to_s.bytesize
+       when String
+         v.bytesize + 2
+       when Symbol
+         v.to_s.bytesize + 2
+       when Time, DateTime
+         24 + 2
+       when Date
+         10 + 2
+       else
+         raise "not sure how to get quoted length of #{v.class}: #{v.inspect}"
+       end
+     end
+
+     def quote_boolean(v)
+       v ? 'TRUE' : 'FALSE'
+     end
+
+     def quote_string(v)
+       SINGLE_QUOTE + connection.escape(v) + SINGLE_QUOTE
+     end
+
+     # This doubles the size of the representation.
+     def quote_binary(v)
+       X_AND_SINGLE_QUOTE + v.unpack("H*")[0] + SINGLE_QUOTE
+     end
+
+     # put raw binary straight into sql
+     # might work if we could get the encoding issues fixed when joining together the values for the sql
+     # alias_method :quote_binary, :quote_string
+
+     def quote_time(v)
+       quote_string v.strftime(ISO8601_DATETIME)
+     end
+
+     def quote_ident(k)
+       BACKTICK + connection.escape(k.to_s) + BACKTICK
+     end
+
+     def quote_big_decimal(v)
+       v.to_s('F')
+     end
+   end
+ end
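
Assembled from `insert_part`, `sql`, and `update_part`, each chunk this module sends is a single multi-row statement whose size is capped by the server's `max_allowed_packet`. A rough sketch of what two buffered rows might produce for a `pets` table (illustrative values, not captured output):

    Upsert.stream(mysql2_client, :pets) do |upsert|   # mysql2_client: a Mysql2::Client (hypothetical)
      upsert.row({:name => 'Jerry'},  :breed => 'beagle')
      upsert.row({:name => 'Pierre'}, :breed => 'tabby')
    end
    # The flushed chunk is one statement shaped like:
    #   INSERT INTO "pets" (`name`,`breed`) VALUES ('Jerry','beagle'),('Pierre','tabby')
    #    ON DUPLICATE KEY UPDATE `name`=VALUES(`name`),`breed`=VALUES(`breed`)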
@@ -0,0 +1,84 @@
+ require 'upsert/pg_connection/column_definition'
+
+ class Upsert
+   # @private
+   module PG_Connection
+
+     attr_reader :merge_function
+
+     def chunk
+       return if rows.empty?
+       row = rows.shift
+       unless merge_function
+         create_merge_function row
+       end
+       hsh = row.to_hash
+       ordered_args = column_definitions.map do |c|
+         hsh[c.name]
+       end
+       %{SELECT #{merge_function}(#{quote_values(ordered_args)})}
+     end
+
+     def execute(sql)
+       connection.exec sql
+     end
+
+     def quote_string(v)
+       SINGLE_QUOTE + connection.escape_string(v) + SINGLE_QUOTE
+     end
+
+     def quote_binary(v)
+       E_AND_SINGLE_QUOTE + connection.escape_bytea(v) + SINGLE_QUOTE
+     end
+
+     def quote_time(v)
+       quote_string [v.strftime(ISO8601_DATETIME), sprintf(USEC_SPRINTF, v.usec)].join('.')
+     end
+
+     def quote_big_decimal(v)
+       v.to_s('F')
+     end
+
+     def quote_boolean(v)
+       v ? 'TRUE' : 'FALSE'
+     end
+
+     def quote_ident(k)
+       connection.quote_ident k.to_s
+     end
+
+     def column_definitions
+       @column_definitions ||= ColumnDefinition.all(connection, table_name)
+     end
+
+     private
+
+     def create_merge_function(example_row)
+       @merge_function = "pg_temp.merge_#{table_name}_#{Kernel.rand(1e11)}"
+       execute <<-EOS
+         CREATE FUNCTION #{merge_function}(#{column_definitions.map { |c| "#{quote_ident(c.input_name)} #{c.sql_type} DEFAULT #{c.default || 'NULL'}" }.join(',') }) RETURNS VOID AS
+         $$
+         BEGIN
+           LOOP
+             -- first try to update the key
+             UPDATE #{table_name} SET #{column_definitions.map { |c| "#{quote_ident(c.name)} = #{quote_ident(c.input_name)}" }.join(',')} WHERE #{example_row.selector.keys.map { |k| "#{quote_ident(k)} = #{quote_ident([k,'input'].join('_'))}" }.join(' AND ') };
+             IF found THEN
+               RETURN;
+             END IF;
+             -- not there, so try to insert the key
+             -- if someone else inserts the same key concurrently,
+             -- we could get a unique-key failure
+             BEGIN
+               INSERT INTO #{table_name}(#{column_definitions.map { |c| quote_ident(c.name) }.join(',')}) VALUES (#{column_definitions.map { |c| quote_ident(c.input_name) }.join(',')});
+               RETURN;
+             EXCEPTION WHEN unique_violation THEN
+               -- Do nothing, and loop to try the UPDATE again.
+             END;
+           END LOOP;
+         END;
+         $$
+         LANGUAGE plpgsql;
+       EOS
+     end
+   end
+ end
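
By contrast with the MySQL module, the Postgres module defines a session-local merge function the first time it sees a row and then sends one `SELECT` per row. A rough sketch for a `pets` table with `name` and `breed` columns (function suffix and column types are illustrative):

    upsert = Upsert.new pg_connection, :pets   # pg_connection: a PG::Connection (hypothetical)
    upsert.row({:name => 'Jerry'}, :breed => 'beagle')
    # First row only: CREATE FUNCTION pg_temp.merge_pets_<random>(name_input varchar DEFAULT NULL,
    #                                                             breed_input varchar DEFAULT NULL) ...
    #                 (UPDATE first; INSERT if nothing matched; retry on unique_violation)
    # Every row:      SELECT pg_temp.merge_pets_<random>('Jerry','beagle')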