postgres_upsert 3.0.0 → 4.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +13 -5
- data/.gitignore +1 -0
- data/.travis.yml +9 -0
- data/Gemfile.lock +93 -54
- data/README.md +106 -40
- data/Rakefile +4 -16
- data/bin/bundle +3 -0
- data/bin/rails +4 -0
- data/bin/rake +4 -0
- data/bin/setup +29 -0
- data/config.ru +4 -0
- data/config/application.rb +24 -0
- data/config/boot.rb +3 -0
- data/config/database.yml +22 -0
- data/config/environment.rb +5 -0
- data/config/environments/development.rb +41 -0
- data/config/environments/production.rb +79 -0
- data/config/environments/test.rb +42 -0
- data/config/locales/en.yml +23 -0
- data/config/routes.rb +56 -0
- data/config/secrets.yml +22 -0
- data/db/migrate/20150214192135_create_test_tables.rb +19 -0
- data/db/schema.rb +28 -0
- data/db/seeds.rb +7 -0
- data/lib/postgres_upsert.rb +8 -5
- data/lib/postgres_upsert/result.rb +11 -0
- data/lib/postgres_upsert/table_writer.rb +46 -0
- data/lib/postgres_upsert/writer.rb +25 -54
- data/postgres_upsert.gemspec +2 -2
- data/spec/pg_upsert_csv_spec.rb +87 -29
- data/spec/spec_helper.rb +5 -37
- metadata +42 -28
- data/lib/postgres_upsert/active_record.rb +0 -13
- data/spec/fixtures/2_col_binary_data.dat +0 -0
- data/spec/pg_upsert_binary_spec.rb +0 -35
- data/spec/spec.opts +0 -1
checksums.yaml
CHANGED
@@ -1,7 +1,15 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
|
2
|
+
!binary "U0hBMQ==":
|
3
|
+
metadata.gz: !binary |-
|
4
|
+
OGQ4YTc4MTFmYzBjYzE1MWY3ODI3NGQ5Yjc5Njk4Mjk2MGJlZjdhNg==
|
5
|
+
data.tar.gz: !binary |-
|
6
|
+
ZmNhNzkxNDg4NzhlOWFhYjRjMWNkZDI5ZTMyMGQwMjZlMmM2ZjI4NQ==
|
5
7
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
|
8
|
+
metadata.gz: !binary |-
|
9
|
+
ODU0N2IzZmM4ZDc1ODgzYzZmMDM1YzEyNTgwN2Y3MWUyMjI0ZmIxNjIzMjEy
|
10
|
+
M2YyOGFlZjRiM2MzMjcwMTU2MzZjM2IyNWExNWQ3YTFiMzk4ZDIzYWY0MzZi
|
11
|
+
ZTNmNGUwMDAxZDQxNGUwYjI4MTk0MWU3OWYxZDYwMzE5YmU0YjE=
|
12
|
+
data.tar.gz: !binary |-
|
13
|
+
ZTVmMDBkMmFjZDRkNzIwZjI0MDNhNWRkZjlkMjg4YWU2YTI1MGM4NDVjMWY4
|
14
|
+
MzI2NDE2MzMwOWQzY2Y3N2UwMjYxZDY1YzE4YTJkMDc2ZDg1M2MyZGVlN2Uz
|
15
|
+
NzA0YjQ1YWUyNTBjMDhmNDNmZTNmYTNkZjE4ZmY0YTk2YWYxNzY=
|
data/.gitignore
CHANGED
data/.travis.yml
ADDED
data/Gemfile.lock
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
PATH
|
2
2
|
remote: .
|
3
3
|
specs:
|
4
|
-
postgres_upsert (
|
4
|
+
postgres_upsert (4.0.0)
|
5
5
|
activerecord (>= 3.0.0)
|
6
6
|
pg (~> 0.17.0)
|
7
7
|
rails (>= 3.0.0)
|
@@ -9,44 +9,62 @@ PATH
|
|
9
9
|
GEM
|
10
10
|
remote: https://rubygems.org/
|
11
11
|
specs:
|
12
|
-
actionmailer (4.0
|
13
|
-
actionpack (= 4.0
|
12
|
+
actionmailer (4.2.0)
|
13
|
+
actionpack (= 4.2.0)
|
14
|
+
actionview (= 4.2.0)
|
15
|
+
activejob (= 4.2.0)
|
14
16
|
mail (~> 2.5, >= 2.5.4)
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
rack (~> 1.
|
17
|
+
rails-dom-testing (~> 1.0, >= 1.0.5)
|
18
|
+
actionpack (4.2.0)
|
19
|
+
actionview (= 4.2.0)
|
20
|
+
activesupport (= 4.2.0)
|
21
|
+
rack (~> 1.6.0)
|
20
22
|
rack-test (~> 0.6.2)
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
23
|
+
rails-dom-testing (~> 1.0, >= 1.0.5)
|
24
|
+
rails-html-sanitizer (~> 1.0, >= 1.0.1)
|
25
|
+
actionview (4.2.0)
|
26
|
+
activesupport (= 4.2.0)
|
27
|
+
builder (~> 3.1)
|
28
|
+
erubis (~> 2.7.0)
|
29
|
+
rails-dom-testing (~> 1.0, >= 1.0.5)
|
30
|
+
rails-html-sanitizer (~> 1.0, >= 1.0.1)
|
31
|
+
activejob (4.2.0)
|
32
|
+
activesupport (= 4.2.0)
|
33
|
+
globalid (>= 0.3.0)
|
34
|
+
activemodel (4.2.0)
|
35
|
+
activesupport (= 4.2.0)
|
36
|
+
builder (~> 3.1)
|
37
|
+
activerecord (4.2.0)
|
38
|
+
activemodel (= 4.2.0)
|
39
|
+
activesupport (= 4.2.0)
|
40
|
+
arel (~> 6.0)
|
41
|
+
activesupport (4.2.0)
|
42
|
+
i18n (~> 0.7)
|
43
|
+
json (~> 1.7, >= 1.7.7)
|
44
|
+
minitest (~> 5.1)
|
45
|
+
thread_safe (~> 0.3, >= 0.3.4)
|
46
|
+
tzinfo (~> 1.1)
|
47
|
+
arel (6.0.0)
|
48
|
+
builder (3.2.2)
|
38
49
|
coderay (1.1.0)
|
39
50
|
diff-lcs (1.1.3)
|
40
51
|
erubis (2.7.0)
|
52
|
+
globalid (0.3.3)
|
53
|
+
activesupport (>= 4.1.0)
|
41
54
|
hike (1.2.3)
|
42
|
-
i18n (0.
|
43
|
-
json (1.
|
55
|
+
i18n (0.7.0)
|
56
|
+
json (1.8.2)
|
57
|
+
loofah (2.0.1)
|
58
|
+
nokogiri (>= 1.5.9)
|
44
59
|
mail (2.6.3)
|
45
60
|
mime-types (>= 1.16, < 3)
|
46
61
|
method_source (0.8.2)
|
47
62
|
mime-types (2.4.3)
|
48
|
-
|
63
|
+
mini_portile (0.6.2)
|
64
|
+
minitest (5.5.1)
|
49
65
|
multi_json (1.10.1)
|
66
|
+
nokogiri (1.6.6.2)
|
67
|
+
mini_portile (~> 0.6.0)
|
50
68
|
pg (0.17.1)
|
51
69
|
pry (0.10.1)
|
52
70
|
coderay (~> 1.1.0)
|
@@ -54,47 +72,68 @@ GEM
|
|
54
72
|
slop (~> 3.4)
|
55
73
|
pry-rails (0.3.2)
|
56
74
|
pry (>= 0.9.10)
|
57
|
-
rack (1.
|
58
|
-
rack-test (0.6.
|
75
|
+
rack (1.6.0)
|
76
|
+
rack-test (0.6.3)
|
59
77
|
rack (>= 1.0)
|
60
|
-
rails (4.0
|
61
|
-
actionmailer (= 4.0
|
62
|
-
actionpack (= 4.0
|
63
|
-
|
64
|
-
|
78
|
+
rails (4.2.0)
|
79
|
+
actionmailer (= 4.2.0)
|
80
|
+
actionpack (= 4.2.0)
|
81
|
+
actionview (= 4.2.0)
|
82
|
+
activejob (= 4.2.0)
|
83
|
+
activemodel (= 4.2.0)
|
84
|
+
activerecord (= 4.2.0)
|
85
|
+
activesupport (= 4.2.0)
|
65
86
|
bundler (>= 1.3.0, < 2.0)
|
66
|
-
railties (= 4.0
|
67
|
-
sprockets-rails
|
68
|
-
|
69
|
-
|
70
|
-
|
87
|
+
railties (= 4.2.0)
|
88
|
+
sprockets-rails
|
89
|
+
rails-deprecated_sanitizer (1.0.3)
|
90
|
+
activesupport (>= 4.2.0.alpha)
|
91
|
+
rails-dom-testing (1.0.5)
|
92
|
+
activesupport (>= 4.2.0.beta, < 5.0)
|
93
|
+
nokogiri (~> 1.6.0)
|
94
|
+
rails-deprecated_sanitizer (>= 1.0.1)
|
95
|
+
rails-html-sanitizer (1.0.1)
|
96
|
+
loofah (~> 2.0)
|
97
|
+
railties (4.2.0)
|
98
|
+
actionpack (= 4.2.0)
|
99
|
+
activesupport (= 4.2.0)
|
71
100
|
rake (>= 0.8.7)
|
72
101
|
thor (>= 0.18.1, < 2.0)
|
73
|
-
rake (10.
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
rspec-
|
78
|
-
|
79
|
-
rspec-
|
80
|
-
rspec-core (2.
|
81
|
-
rspec-expectations (2.
|
82
|
-
diff-lcs (
|
83
|
-
rspec-mocks (2.
|
102
|
+
rake (10.4.2)
|
103
|
+
rspec (2.99.0)
|
104
|
+
rspec-core (~> 2.99.0)
|
105
|
+
rspec-expectations (~> 2.99.0)
|
106
|
+
rspec-mocks (~> 2.99.0)
|
107
|
+
rspec-collection_matchers (1.1.2)
|
108
|
+
rspec-expectations (>= 2.99.0.beta1)
|
109
|
+
rspec-core (2.99.2)
|
110
|
+
rspec-expectations (2.99.2)
|
111
|
+
diff-lcs (>= 1.1.3, < 2.0)
|
112
|
+
rspec-mocks (2.99.3)
|
113
|
+
rspec-rails (2.99.0)
|
114
|
+
actionpack (>= 3.0)
|
115
|
+
activemodel (>= 3.0)
|
116
|
+
activesupport (>= 3.0)
|
117
|
+
railties (>= 3.0)
|
118
|
+
rspec-collection_matchers
|
119
|
+
rspec-core (~> 2.99.0)
|
120
|
+
rspec-expectations (~> 2.99.0)
|
121
|
+
rspec-mocks (~> 2.99.0)
|
84
122
|
slop (3.6.0)
|
85
123
|
sprockets (2.12.3)
|
86
124
|
hike (~> 1.2)
|
87
125
|
multi_json (~> 1.0)
|
88
126
|
rack (~> 1.0)
|
89
127
|
tilt (~> 1.1, != 1.3.0)
|
90
|
-
sprockets-rails (2.2.
|
128
|
+
sprockets-rails (2.2.4)
|
91
129
|
actionpack (>= 3.0)
|
92
130
|
activesupport (>= 3.0)
|
93
131
|
sprockets (>= 2.8, < 4.0)
|
94
132
|
thor (0.19.1)
|
95
133
|
thread_safe (0.3.4)
|
96
134
|
tilt (1.4.1)
|
97
|
-
tzinfo (
|
135
|
+
tzinfo (1.2.2)
|
136
|
+
thread_safe (~> 0.1)
|
98
137
|
|
99
138
|
PLATFORMS
|
100
139
|
ruby
|
@@ -103,5 +142,5 @@ DEPENDENCIES
|
|
103
142
|
bundler
|
104
143
|
postgres_upsert!
|
105
144
|
pry-rails
|
106
|
-
rdoc
|
107
145
|
rspec (~> 2.12)
|
146
|
+
rspec-rails (~> 2.0)
|
data/README.md
CHANGED
@@ -1,9 +1,11 @@
|
|
1
|
-
# postgres_upsert
|
1
|
+
# postgres_upsert [![Build Status](https://travis-ci.org/theSteveMitchell/postgres_upsert.svg?branch=master)](https://travis-ci.org/theSteveMitchell/postgres_upsert)
|
2
2
|
|
3
3
|
Allows your rails app to load data in a very fast way, avoiding calls to ActiveRecord.
|
4
4
|
|
5
5
|
Using the PG gem and postgres's powerful COPY command, you can create thousands of rails objects in your db in a single query.
|
6
6
|
|
7
|
+
## Compatibility Note
|
8
|
+
The master branch requires the 'pg' gem, which only supports MRI Ruby. The jruby branch requires 'activerecord-jdbcpostgresql-adapter', which, of course, only supports JRuby. Installation is the same whatever your platform.
|
7
9
|
|
8
10
|
## Install
|
9
11
|
|
@@ -17,71 +19,135 @@ Run the bundle command
|
|
17
19
|
|
18
20
|
## Usage
|
19
21
|
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
io_object_or_file_path
|
22
|
+
```ruby
|
23
|
+
PostgresUpsert.write <class_or_table_name>, <io_object_or_file_path>[, options]
|
24
|
+
```
|
25
|
+
<class_or_table_name> is either an ActiveRecord::Base subclass, or a string representing the name of a database table.
|
26
|
+
<io_object_or_file_path> can be either a string representing a file path, or an IO object (StringIO, File, etc.)
|
25
27
|
|
26
28
|
options:
|
27
|
-
:delimiter - the string to use to delimit fields. Default is ","
|
28
|
-
:
|
29
|
-
:
|
30
|
-
:
|
31
|
-
:update_only => when true, postgres_upsert will ONLY update existing records, and not insert new. Default is false.
|
32
|
-
|
33
|
-
pg_upsert will allow you to copy data from an arbritary IO object or from a file in the database server (when you pass the path as string).
|
34
|
-
Let's first copy from a file in the database server, assuming again that we have a users table and
|
35
|
-
that we are in the Rails console:
|
29
|
+
- :delimiter - the string to use to delimit fields from the source data. Default is ","
|
30
|
+
- :header => specifies if the file/io source contains a header row. Either :header option must be true, or :columns list must be passed. Default true
|
31
|
+
- :key_column => the primary key or unique key column on your destination table, used to distinguish new records from existing records. Default is the primary_key of your destination table/model.
|
32
|
+
- :update_only => when true, postgres_upsert will ONLY update existing records, and not insert new. Default is false.
|
36
33
|
|
34
|
+
## Examples
|
35
|
+
for these examples let's assume we have a users table and model:
|
37
36
|
```ruby
|
38
|
-
User
|
37
|
+
class User < ActiveRecord::Base
|
38
|
+
```
|
39
|
+
In the rails console we can run:
|
40
|
+
```ruby
|
41
|
+
PostgresUpsert.write User, "/tmp/users.csv"
|
39
42
|
```
|
40
43
|
|
41
|
-
This command will use the headers in the CSV file as fields of the target table
|
42
|
-
If the
|
43
|
-
|
44
|
+
This command will use the headers in the CSV file as fields of the target table (by default)
|
45
|
+
If the CSV file's header does not match the field names of the User class, you can pass a map in the options parameter.
|
44
46
|
```ruby
|
45
|
-
|
47
|
+
PostgresUpsert.write "users", "/tmp/users.csv", :map => {'name' => 'first_name'}
|
46
48
|
```
|
49
|
+
The `name` column in the CSV file will be mapped to the `first_name` field in the users table.
|
50
|
+
|
51
|
+
postgres_upsert supports 'merge' operations, which is not yet natively supported in Postgres. The data can include both new and existing records, and postgres_upsert will handle either update or insert of records appropriately. Since the Postgres COPY command does not handle this, postgres_upsert accomplishes it using an intermediary temp table.
|
52
|
+
|
53
|
+
The merge/upsert happens in 5 steps (assume your data table is called "users")
|
54
|
+
* create a temp table named users_temp_123 where "123" is a random int. In postgres temp tables are only visible to the current database session, so naming conflicts should not be a problem. We add this random suffix just for additional safety.
|
55
|
+
* COPY the data to users_temp_123
|
56
|
+
* issue a query to insert all new records from users_temp_123 into users ("new" records are those records whose primary key does not already exist in the users table)
|
57
|
+
* issue a query to update all existing records in users with the data in users_temp_123 ("existing" records are those whose primary key already exists in the users table)
|
58
|
+
* drop the temp table.
|
59
|
+
|
60
|
+
## timestamp columns
|
61
|
+
|
62
|
+
currently postgres_upsert detects and manages the default rails timestamp columns `created_at` and `updated_at`. If these fields exist in your destination table, postgres_upsert will keep these current as expected. I recommend you do NOT include these fields in your source CSV/IO, as postgres_upsert will not honor them.
|
47
63
|
|
48
|
-
|
64
|
+
* newly inserted records get a current timestamp for created_at
|
65
|
+
* records existing in the source file/IO will get an update to their updated_at timestamp (even if all fields maintain the same value)
|
66
|
+
* records that are in the destination table but not the source will not have their timestamps changed.
|
49
67
|
|
50
|
-
|
68
|
+
|
69
|
+
### Overriding the key_column
|
70
|
+
|
71
|
+
By default postgres_upsert uses the primary key on your ActiveRecord table to determine if each record should be inserted or updated. You can override the column using the :key_column option:
|
51
72
|
|
52
73
|
```ruby
|
53
|
-
|
74
|
+
PostgresUpsert.write User, "/tmp/users.csv", :key_column => ["external_twitter_id"]
|
54
75
|
```
|
55
76
|
|
56
|
-
|
77
|
+
obviously, the field you pass must be a unique key in your database (this is not enforced at the moment, but will be)
|
57
78
|
|
58
|
-
|
59
|
-
COPY users (id, name) FROM '/tmp/users.dat' WITH BINARY
|
60
|
-
```
|
79
|
+
passing :update_only => true will ensure that no new records are created, but records will be updated.
|
61
80
|
|
62
|
-
|
81
|
+
### Insert/Update Counts
|
82
|
+
PostgresUpsert will also return a PostgresUpsert::Result object that will tell you how many records were inserted or updated:
|
63
83
|
|
84
|
+
```ruby
|
85
|
+
User.delete_all
|
86
|
+
result = PostgresUpsert.write User, "/tmp/users.csv"
|
87
|
+
result.inserted
|
88
|
+
# => 10000
|
89
|
+
result.updated
|
90
|
+
# => 0
|
91
|
+
```
|
64
92
|
|
65
|
-
|
93
|
+
### Huge Caveat!
|
94
|
+
Since postgres_upsert does not use validations or even instantiate rails objects, you can get invalid data if you're not careful. Postgres upsert assumes that your source data is minimally cleaned up, and will not tell you if any data is invalid based on rails model rules. It will, of course raise an error if data does not conform to your database constraints.
|
66
95
|
|
67
|
-
|
68
|
-
* create a temp table named users_temp_### where "###" is a random number. In postgres temp tables are only visible to the current database session, so naming conflicts should not be a problem.
|
69
|
-
* COPY the data to user_temp
|
70
|
-
* issue a query to insert all new records from users_temp_### into users (newness is determined by the presence of the primary key in the users table)
|
71
|
-
* issue a query to update all records in users with the data in users_temp_### (matching on primary key)
|
72
|
-
* drop the temp table.
|
96
|
+
### Benchmarks!
|
73
97
|
|
74
|
-
|
98
|
+
Given a User model (validates presence of email and password)
|
99
|
+
```console
|
100
|
+
2.1.3 :008 > User
|
101
|
+
=> User(id: integer, email: string, password: string, created_at: datetime, updated_at: datetime)
|
102
|
+
```
|
75
103
|
|
76
|
-
|
104
|
+
And the following railsy code to create 10,000 users:
|
105
|
+
```ruby
|
106
|
+
def insert_dumb
|
107
|
+
time = Benchmark.measure do
|
108
|
+
(1..10000).each do |n|
|
109
|
+
User.create!(:email => "number#{n}@postgres.up", :password => "#{(n-5..n).to_a.join('')}")
|
110
|
+
end
|
111
|
+
end
|
112
|
+
puts time
|
113
|
+
end
|
114
|
+
```
|
77
115
|
|
116
|
+
Compared to the following code using Postgres_upsert:
|
78
117
|
```ruby
|
79
|
-
|
118
|
+
def insert_smart
|
119
|
+
time = Benchmark.measure do
|
120
|
+
csv_string = CSV.generate do |csv|
|
121
|
+
csv << %w(email password)
|
122
|
+
(1..10000).each do |n|
|
123
|
+
csv << ["number#{n}@postgres.up", "#{(n-5..n).to_a.join('')}"]
|
124
|
+
end
|
125
|
+
end
|
126
|
+
io = StringIO.new(csv_string)
|
127
|
+
PostgresUpsert.write User, io, key_column: "email"
|
128
|
+
end
|
129
|
+
puts time
|
130
|
+
end
|
80
131
|
```
|
81
132
|
|
82
|
-
|
133
|
+
let's compare!
|
134
|
+
|
135
|
+
```console
|
136
|
+
2.1.3 :002 > insert_dumb
|
137
|
+
#...snip ~30k lines of output :( (10k queries, each wrapped in a transaction)
|
138
|
+
(0.3ms) COMMIT
|
139
|
+
26.639246
|
140
|
+
2.1.3 :004 > User.delete_all
|
141
|
+
SQL (15.4ms) DELETE FROM "users"
|
142
|
+
2.1.3 :006 > insert_smart
|
143
|
+
#...snip ~30 lines of output, composing 5 sql queries...
|
144
|
+
0.275503
|
145
|
+
```
|
146
|
+
|
147
|
+
...That's 26.6 seconds for classic create loop... vs. 0.276 seconds for postgres_upsert.
|
148
|
+
This is over 96X faster. And it only cost me ~6 extra lines of code.
|
83
149
|
|
84
|
-
|
150
|
+
Note that for the benchmark, my database is local. The performance improvement should only increase when we have network latency to worry about.
|
85
151
|
|
86
152
|
## Note on Patches/Pull Requests
|
87
153
|
|
data/Rakefile
CHANGED
@@ -1,18 +1,6 @@
|
|
1
|
-
#
|
2
|
-
|
3
|
-
require 'bundler/gem_tasks'
|
4
|
-
require 'rubygems'
|
5
|
-
require 'rspec/core/rake_task'
|
6
|
-
require 'rdoc/task'
|
1
|
+
# Add your own tasks in files placed in lib/tasks ending in .rake,
|
2
|
+
# for example lib/tasks/capistrano.rake, and they will automatically be available to Rake.
|
7
3
|
|
8
|
-
|
4
|
+
require File.expand_path('../config/application', __FILE__)
|
9
5
|
|
10
|
-
|
11
|
-
|
12
|
-
Rake::RDocTask.new do |rdoc|
|
13
|
-
version = File.exist?('VERSION') ? File.read('VERSION') : ""
|
14
|
-
rdoc.rdoc_dir = 'rdoc'
|
15
|
-
rdoc.title = "postgres_upsert #{version}"
|
16
|
-
rdoc.rdoc_files.include('README*')
|
17
|
-
rdoc.rdoc_files.include('lib/**/*.rb')
|
18
|
-
end
|
6
|
+
Rails.application.load_tasks
|