upsert 0.2.1 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
data/CHANGELOG CHANGED
@@ -1,3 +1,9 @@
1
+ 0.2.2 / 2012-06-21
2
+
3
+ * Bug fixes
4
+
5
+ * Correct and simplify how SQL length is calculated when batching (streaming) MySQL upserts.
6
+
1
7
  0.2.1 / 2012-06-21
2
8
 
3
9
  * Enhancements
data/README.md CHANGED
@@ -33,7 +33,7 @@ For bulk upserts, you probably still want to use `Upsert.stream`.
33
33
  Pet.upsert({:name => 'Jerry'}, :breed => 'beagle')
34
34
  Pet.upsert({:name => 'Pierre'}, :breed => 'tabby')
35
35
 
36
- ### Gotchas
36
+ ### The "fixed column set" gotcha
37
37
 
38
38
  Currently, the first row you pass in determines the columns that will be used. That's useful for mass importing of many rows with the same columns, but is surprising if you're trying to use a single `Upsert` object to add arbitrary data. For example, this won't work:
39
39
 
@@ -47,7 +47,14 @@ You would need to use a new `Upsert` object. On the other hand, this is totally
47
47
  Pet.upsert({:name => 'Jerry'}, :breed => 'beagle')
48
48
  Pet.upsert({:tag_number => 456}, :spiel => 'great cat')
49
49
 
50
- Please send in a pull request if you think there's a better way!
50
+ ## Wishlist
51
+
52
+ Pull requests for any of these would be greatly appreciated:
53
+
54
+ 1. Somebody who understands statistics should look at how I'm sampling rows in `Upsert::Mysql2_Client#estimate_variable_sql_bytesize`... I think we can assume that row sizes are independent of row order, so I don't think we actually have to select random elements; sampling the first few rows should work just as well.
55
+ 2. Fix SQLite tests.
56
+ 3. A fix for the "fixed column set" gotcha, if you can think of one.
57
+ 4. Naming suggestions: should "document" be called "setters" or "attributes"? Should "stream" be "batch" instead?
51
58
 
52
59
  ## Real-world usage
53
60
 
@@ -49,6 +49,7 @@ class Upsert
49
49
  USEC_SPRINTF = '%06d'
50
50
  ISO8601_DATETIME = '%Y-%m-%d %H:%M:%S'
51
51
  ISO8601_DATE = '%F'
52
+ NULL_WORD = 'NULL'
52
53
 
53
54
  # @return [Mysql2::Client,Sqlite3::Database,PG::Connection,#raw_connection]
54
55
  attr_reader :connection
@@ -120,7 +121,7 @@ class Upsert
120
121
  def quote_value(v)
121
122
  case v
122
123
  when NilClass
123
- 'NULL'
124
+ NULL_WORD
124
125
  when Upsert::Binary
125
126
  quote_binary v # must be defined by base
126
127
  when String
@@ -141,19 +142,4 @@ class Upsert
141
142
  raise "not sure how to quote #{v.class}: #{v.inspect}"
142
143
  end
143
144
  end
144
-
145
- # @private
146
- def quote_idents(idents)
147
- idents.map { |k| quote_ident(k) }.join(',') # must be defined by base
148
- end
149
-
150
- # @private
151
- def quote_values(values)
152
- values.map { |v| quote_value(v) }.join(',')
153
- end
154
-
155
- # @private
156
- def quote_pairs(pairs)
157
- pairs.map { |k, v| [quote_ident(k),quote_value(v)].join('=') }.join(',')
158
- end
159
145
  end
@@ -1,6 +1,8 @@
1
1
  class Upsert
2
2
  # @private
3
3
  module Mysql2_Client
4
+ SAMPLE = 0.1
5
+
4
6
  def chunk
5
7
  return if rows.empty?
6
8
  all = rows.length
@@ -11,9 +13,9 @@ class Upsert
11
13
  if async? and take == all
12
14
  return
13
15
  end
14
- while take > 1 and oversize?(take)
16
+ while take > 2 and oversize?(take)
15
17
  $stderr.puts " Length prediction via sampling failed, shrinking" if ENV['UPSERT_DEBUG'] == 'true'
16
- take -= 1
18
+ take -= 2
17
19
  end
18
20
  chunk = sql take
19
21
  while take > 1 and chunk.bytesize > max_sql_bytesize
@@ -46,14 +48,13 @@ class Upsert
46
48
  end
47
49
 
48
50
  def insert_part
49
- @insert_part ||= %{INSERT INTO "#{table_name}" (#{quote_idents(columns)}) VALUES }
51
+ @insert_part ||= %{INSERT INTO "#{table_name}" (#{columns.join(',')}) VALUES }
50
52
  end
51
53
 
52
54
  def update_part
53
55
  @update_part ||= begin
54
56
  updaters = columns.map do |k|
55
- qk = quote_ident k
56
- [ qk, "VALUES(#{qk})" ].join('=')
57
+ [ k, "VALUES(#{k})" ].join('=')
57
58
  end.join(',')
58
59
  %{ ON DUPLICATE KEY UPDATE #{updaters}}
59
60
  end
@@ -64,13 +65,18 @@ class Upsert
64
65
  @static_sql_bytesize ||= insert_part.bytesize + update_part.bytesize + 2
65
66
  end
66
67
 
67
- # where 3 is parens and comma
68
+
68
69
  def variable_sql_bytesize(take)
69
- rows.first(take).inject(0) { |sum, row| sum + row.values_sql_bytesize + 3 }
70
+ memo = rows.first(take).inject(0) { |sum, row| sum + row.values_sql_bytesize }
71
+ if take > 0
72
+ # parens and comma
73
+ memo += 3*(take-1)
74
+ end
75
+ memo
70
76
  end
71
77
 
72
78
  def estimate_variable_sql_bytesize(take)
73
- n = (take / 10.0).ceil
79
+ n = (take * SAMPLE).ceil
74
80
  sample = if RUBY_VERSION >= '1.9'
75
81
  rows.first(take).sample(n)
76
82
  else
@@ -82,7 +88,12 @@ class Upsert
82
88
  end
83
89
  memo.first(n)
84
90
  end
85
- 10.0 * sample.inject(0) { |sum, row| sum + row.values_sql_bytesize + 3 }
91
+ memo = sample.inject(0) { |sum, row| sum + row.values_sql_bytesize } / SAMPLE
92
+ if take > 0
93
+ # parens and comma
94
+ memo += 3*(take-1)
95
+ end
96
+ memo
86
97
  end
87
98
 
88
99
  def sql_bytesize(take)
@@ -112,33 +123,6 @@ class Upsert
112
123
  end
113
124
  end
114
125
 
115
- def quoted_value_bytesize(v)
116
- case v
117
- when NilClass
118
- 4
119
- when TrueClass
120
- 4
121
- when FalseClass
122
- 5
123
- when BigDecimal
124
- v.to_s('F').bytesize
125
- when Upsert::Binary
126
- v.bytesize * 2 + 3
127
- when Numeric
128
- v.to_s.bytesize
129
- when String
130
- v.bytesize + 2
131
- when Symbol
132
- v.to_s.bytesize + 2
133
- when Time, DateTime
134
- 24 + 2
135
- when Date
136
- 10 + 2
137
- else
138
- raise "not sure how to get quoted length of #{v.class}: #{v.inspect}"
139
- end
140
- end
141
-
142
126
  def quote_boolean(v)
143
127
  v ? 'TRUE' : 'FALSE'
144
128
  end
@@ -14,9 +14,9 @@ class Upsert
14
14
  end
15
15
  hsh = row.to_hash
16
16
  ordered_args = column_definitions.map do |c|
17
- hsh[c.name]
17
+ hsh[c.name] || NULL_WORD
18
18
  end
19
- %{SELECT #{merge_function}(#{quote_values(ordered_args)})}
19
+ %{SELECT #{merge_function}(#{ordered_args.join(',')})}
20
20
  end
21
21
 
22
22
  def execute(sql)
@@ -56,12 +56,12 @@ class Upsert
56
56
  def create_merge_function(example_row)
57
57
  @merge_function = "pg_temp.merge_#{table_name}_#{Kernel.rand(1e11)}"
58
58
  execute <<-EOS
59
- CREATE FUNCTION #{merge_function}(#{column_definitions.map { |c| "#{quote_ident(c.input_name)} #{c.sql_type} DEFAULT #{c.default || 'NULL'}" }.join(',') }) RETURNS VOID AS
59
+ CREATE FUNCTION #{merge_function}(#{column_definitions.map { |c| "#{c.input_name} #{c.sql_type} DEFAULT #{c.default || 'NULL'}" }.join(',') }) RETURNS VOID AS
60
60
  $$
61
61
  BEGIN
62
62
  LOOP
63
63
  -- first try to update the key
64
- UPDATE #{table_name} SET #{column_definitions.map { |c| "#{quote_ident(c.name)} = #{quote_ident(c.input_name)}" }.join(',')} WHERE #{example_row.selector.keys.map { |k| "#{quote_ident(k)} = #{quote_ident([k,'input'].join('_'))}" }.join(' AND ') };
64
+ UPDATE #{table_name} SET #{column_definitions.map { |c| "#{c.name} = #{c.input_name}" }.join(',')} WHERE #{example_row.raw_selector.keys.map { |k| "#{quote_ident(k)} = #{quote_ident([k,'input'].join('_'))}" }.join(' AND ') };
65
65
  IF found THEN
66
66
  RETURN;
67
67
  END IF;
@@ -69,7 +69,7 @@ BEGIN
69
69
  -- if someone else inserts the same key concurrently,
70
70
  -- we could get a unique-key failure
71
71
  BEGIN
72
- INSERT INTO #{table_name}(#{column_definitions.map { |c| quote_ident(c.name) }.join(',')}) VALUES (#{column_definitions.map { |c| quote_ident(c.input_name) }.join(',')});
72
+ INSERT INTO #{table_name}(#{column_definitions.map { |c| c.name }.join(',')}) VALUES (#{column_definitions.map { |c| c.input_name }.join(',')});
73
73
  RETURN;
74
74
  EXCEPTION WHEN unique_violation THEN
75
75
  -- Do nothing, and loop to try the UPDATE again.
@@ -39,7 +39,7 @@ EOS
39
39
  res.reject do |row|
40
40
  row['name'] == auto_increment_primary_key
41
41
  end.map do |row|
42
- new row['name'], row['sql_type'], row['default']
42
+ new connection, row['name'], row['sql_type'], row['default']
43
43
  end
44
44
  end
45
45
  end
@@ -49,9 +49,9 @@ EOS
49
49
  attr_reader :sql_type
50
50
  attr_reader :default
51
51
 
52
- def initialize(name, sql_type, default)
53
- @name = name
54
- @input_name = "#{name}_input"
52
+ def initialize(connection, raw_name, sql_type, default)
53
+ @name = connection.quote_ident raw_name
54
+ @input_name = connection.quote_ident "#{raw_name}_input"
55
55
  @sql_type = sql_type
56
56
  @default = default
57
57
  end
@@ -2,13 +2,21 @@ class Upsert
2
2
  # @private
3
3
  class Row
4
4
  attr_reader :parent
5
+ attr_reader :raw_selector
5
6
  attr_reader :selector
6
7
  attr_reader :document
7
8
 
8
- def initialize(parent, selector, document)
9
+ def initialize(parent, raw_selector, raw_document)
9
10
  @parent = parent
10
- @selector = selector
11
- @document = document
11
+ @raw_selector = raw_selector
12
+ @selector = raw_selector.inject({}) do |memo, (k, v)|
13
+ memo[parent.quote_ident(k)] = parent.quote_value(v)
14
+ memo
15
+ end
16
+ @document = raw_document.inject({}) do |memo, (k, v)|
17
+ memo[parent.quote_ident(k)] = parent.quote_value(v)
18
+ memo
19
+ end
12
20
  end
13
21
 
14
22
  def columns
@@ -16,40 +24,40 @@ class Upsert
16
24
  end
17
25
 
18
26
  def values_sql_bytesize
19
- @values_sql_bytesize ||= pairs.inject(0) { |sum, (_, v)| sum + parent.quoted_value_bytesize(v) }
27
+ @values_sql_bytesize ||= pairs.inject(0) { |sum, (_, v)| sum + v.to_s.bytesize } + columns.length - 1
20
28
  end
21
29
 
22
30
  def values_sql
23
- parent.quote_values pairs.map { |_, v| v }
31
+ pairs.map { |_, v| v }.join(',')
24
32
  end
25
33
 
26
34
  def columns_sql
27
- parent.quote_idents columns
35
+ pairs.map { |k, _| k }.join(',')
28
36
  end
29
37
 
30
38
  def where_sql
31
- parent.quote_pairs selector
39
+ selector.map { |k, v| [k, v].join('=') }.join(',')
32
40
  end
33
41
 
34
42
  def set_sql
35
- parent.quote_pairs pairs
43
+ pairs.map { |k, v| [k, v].join('=') }.join(',')
36
44
  end
37
45
 
38
46
  def pairs
39
47
  @pairs ||= columns.map do |k|
40
- value = if document.has_key?(k)
48
+ v = if document.has_key?(k)
41
49
  # prefer the document so that you can change rows
42
50
  document[k]
43
51
  else
44
52
  selector[k]
45
53
  end
46
- [ k, value ]
54
+ [ k, v ]
47
55
  end
48
56
  end
49
57
 
50
58
  def to_hash
51
59
  @to_hash ||= pairs.inject({}) do |memo, (k, v)|
52
- memo[k.to_s] = v
60
+ memo[k] = v
53
61
  memo
54
62
  end
55
63
  end
@@ -1,3 +1,3 @@
1
1
  class Upsert
2
- VERSION = "0.2.1"
2
+ VERSION = "0.2.2"
3
3
  end
@@ -4,7 +4,7 @@ require 'mysql2'
4
4
  system %{ mysql -u root -ppassword -e "DROP DATABASE IF EXISTS test_upsert; CREATE DATABASE test_upsert CHARSET utf8" }
5
5
  ActiveRecord::Base.establish_connection :adapter => 'mysql2', :username => 'root', :password => 'password', :database => 'test_upsert'
6
6
 
7
- describe "upserting on mysql2" do
7
+ describe Upsert::Mysql2_Client do
8
8
  before do
9
9
  @opened_connections = []
10
10
  ActiveRecord::Base.connection.drop_table(Pet.table_name) rescue nil
@@ -38,4 +38,69 @@ describe "upserting on mysql2" do
38
38
  it_also "doesn't mess with timezones"
39
39
 
40
40
  it_also "doesn't blow up on reserved words"
41
+
42
+ describe '#sql_bytesize' do
43
+ def assert_exact(selector_proc, document_proc, show = false)
44
+ upsert = Upsert.new connection, :pets
45
+ 0.upto(256) do |i|
46
+ upsert.rows << Upsert::Row.new(upsert, selector_proc.call(i), document_proc.call(i))
47
+ i.upto(upsert.rows.length) do |take|
48
+ expected_sql = upsert.sql(take)
49
+ actual = upsert.sql_bytesize(take)
50
+ if show and actual != expected_sql.bytesize
51
+ $stderr.puts
52
+ $stderr.puts "Expected: #{expected_sql.bytesize}"
53
+ $stderr.puts "Actual: #{actual}"
54
+ $stderr.puts expected_sql
55
+ end
56
+ actual.must_equal expected_sql.bytesize
57
+ end
58
+ end
59
+ end
60
+ def rand_string(length)
61
+ # http://www.dzone.com/snippets/generate-random-string-letters
62
+ # Array.new(length) { (rand(122-97) + 97).chr }.join
63
+ if RUBY_VERSION >= '1.9'
64
+ Array.new(length) { rand(512).chr(Encoding::UTF_8) }.join
65
+ else
66
+ Array.new(length) { rand(512) }.pack('C*')
67
+ end
68
+ end
69
+ it "is exact as selector length changes" do
70
+ selector_proc = proc do |i|
71
+ { :name => rand_string(i) }
72
+ end
73
+ document_proc = proc do |i|
74
+ {}
75
+ end
76
+ assert_exact selector_proc, document_proc
77
+ end
78
+ it "is exact as value length changes" do
79
+ selector_proc = proc do |i|
80
+ { :name => 'Jerry' }
81
+ end
82
+ document_proc = proc do |i|
83
+ { :spiel => rand_string(i) }
84
+ end
85
+ assert_exact selector_proc, document_proc
86
+ end
87
+ it "is exact as both selector and value length change" do
88
+ selector_proc = proc do |i|
89
+ { :name => rand_string(i) }
90
+ end
91
+ document_proc = proc do |i|
92
+ { :spiel => rand_string(i) }
93
+ end
94
+ assert_exact selector_proc, document_proc
95
+ end
96
+ it "is exact with numbers too" do
97
+ selector_proc = proc do |i|
98
+ { :tag_number => rand(1e5) }
99
+ end
100
+ document_proc = proc do |i|
101
+ { :lovability => rand }
102
+ end
103
+ assert_exact selector_proc, document_proc
104
+ end
105
+ end
41
106
  end
@@ -5,7 +5,7 @@ system %{ dropdb test_upsert }
5
5
  system %{ createdb test_upsert }
6
6
  ActiveRecord::Base.establish_connection :adapter => 'postgresql', :database => 'test_upsert'
7
7
 
8
- describe "upserting on postgresql" do
8
+ describe Upsert::PG_Connection do
9
9
  before do
10
10
  @opened_connections = []
11
11
  ActiveRecord::Base.connection.drop_table(Pet.table_name) rescue nil
@@ -6,7 +6,7 @@ FileUtils.mkdir_p File.dirname(db_path)
6
6
  FileUtils.rm_f db_path
7
7
  ActiveRecord::Base.establish_connection :adapter => 'sqlite3', :database => db_path
8
8
 
9
- describe "upserting on sqlite" do
9
+ describe Upsert::SQLite3_Database do
10
10
  before do
11
11
  @opened_connections = []
12
12
  ActiveRecord::Base.connection.drop_table(Pet.table_name) rescue nil
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: upsert
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.2.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors: