beetle_etl 0.0.16 → 0.0.19
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +50 -2
- data/lib/beetle_etl.rb +2 -2
- data/lib/beetle_etl/dsl/transformation.rb +2 -2
- data/lib/beetle_etl/naming.rb +8 -8
- data/lib/beetle_etl/step_runner/dependency_resolver.rb +8 -11
- data/lib/beetle_etl/steps/assign_ids.rb +4 -4
- data/lib/beetle_etl/steps/create_stage.rb +7 -1
- data/lib/beetle_etl/steps/load.rb +9 -21
- data/lib/beetle_etl/steps/map_relations.rb +2 -1
- data/lib/beetle_etl/steps/table_diff.rb +19 -19
- data/lib/beetle_etl/version.rb +1 -1
- data/spec/steps/load_spec.rb +13 -15
- data/spec/task_runner/dependency_resolver_spec.rb +1 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: dacfd8801cf603b9cec442ee9ea025bc14026234
|
4
|
+
data.tar.gz: 1a183e975200bb9bdc6e631130903b6ead9739be
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bacbe3f6292e52b1ad23d38f5e128b1cf39ec63ded808310a8353956f2b11d349b4ea546012beb22e03edbd3de9bdd787ca0bbde329e1755ebacc23c9561a8ca
|
7
|
+
data.tar.gz: cbc51b9be8c35b70c21829af0c38f6db0898f4b5c38a2fc6138968bc63591e91f51d7558b2fff62196ba43789b3255f2347c9982d78de33a4d025e3da167084d
|
data/README.md
CHANGED
@@ -2,7 +2,9 @@
|
|
2
2
|
[![Build Status](https://travis-ci.org/maiwald/beetle_etl.svg?branch=master)](https://travis-ci.org/maiwald/beetle_etl)
|
3
3
|
[![Code Climate](https://codeclimate.com/github/maiwald/beetle_etl.png)](https://codeclimate.com/github/maiwald/beetle_etl)
|
4
4
|
|
5
|
-
|
5
|
+
BeetleETL helps you with synchronising relational databases and recurring imports of data. It is actually quite nice.
|
6
|
+
|
7
|
+
It currently only works with PostgreSQL databases.
|
6
8
|
|
7
9
|
## Installation
|
8
10
|
|
@@ -20,7 +22,53 @@ Or install it yourself as:
|
|
20
22
|
|
21
23
|
## Usage
|
22
24
|
|
23
|
-
|
25
|
+
### Configuration
|
26
|
+
|
27
|
+
BeetleETL.configure do |config|
|
28
|
+
config.transformation_file = # path to your imports
|
29
|
+
config.database_config = # sequel database config
|
30
|
+
# or config.database = # sequel database instance
|
31
|
+
config.external_source = 'source_name'
|
32
|
+
config.logger = Logger.new(STDOUT)
|
33
|
+
end
|
34
|
+
|
35
|
+
### Defining Imports
|
36
|
+
|
37
|
+
Fill a file with all the tables you wish to import and write queries to select the data you want.
|
38
|
+
|
39
|
+
import :departments do
|
40
|
+
columns :name
|
41
|
+
|
42
|
+
references :organisations, on: :organisation_id
|
43
|
+
|
44
|
+
query <<-SQL
|
45
|
+
INSERT INTO #{stage_table} (
|
46
|
+
external_id,
|
47
|
+
name,
|
48
|
+
external_organisation_id
|
49
|
+
)
|
50
|
+
|
51
|
+
SELECT
|
52
|
+
o.id,
|
53
|
+
o."dep_name",
|
54
|
+
data."address"
|
55
|
+
|
56
|
+
FROM "Organisation" o
|
57
|
+
JOIN additional_data data
|
58
|
+
ON data.org_id = o.id
|
59
|
+
SQL
|
60
|
+
end
|
61
|
+
|
62
|
+
|
63
|
+
### Running BeetleETL
|
64
|
+
|
65
|
+
BeetleETL.import
|
66
|
+
|
67
|
+
## Development
|
68
|
+
|
69
|
+
To run the specs call
|
70
|
+
|
71
|
+
$ bundle exec rspec
|
24
72
|
|
25
73
|
## Contributing
|
26
74
|
|
data/lib/beetle_etl.rb
CHANGED
@@ -34,12 +34,12 @@ module BeetleETL
|
|
34
34
|
:database,
|
35
35
|
:transformation_file,
|
36
36
|
:stage_schema,
|
37
|
-
:
|
37
|
+
:target_schema,
|
38
38
|
:external_source,
|
39
39
|
:logger
|
40
40
|
|
41
41
|
def initialize
|
42
|
-
@
|
42
|
+
@target_schema = 'public'
|
43
43
|
@logger = ::Logger.new(STDOUT)
|
44
44
|
end
|
45
45
|
end
|
@@ -8,8 +8,8 @@ module BeetleETL
|
|
8
8
|
def initialize(table_name, setup, helpers = nil)
|
9
9
|
@table_name = table_name
|
10
10
|
@parsed = DSL.new(table_name).tap do |dsl|
|
11
|
-
dsl.
|
12
|
-
dsl.
|
11
|
+
dsl.instance_exec(&helpers) if helpers
|
12
|
+
dsl.instance_exec(&setup)
|
13
13
|
end
|
14
14
|
end
|
15
15
|
|
data/lib/beetle_etl/naming.rb
CHANGED
@@ -15,22 +15,22 @@ module BeetleETL
|
|
15
15
|
%Q("#{stage_table_name(table_name)}")
|
16
16
|
end
|
17
17
|
|
18
|
-
def
|
18
|
+
def target_table_name(table_name = nil)
|
19
19
|
name = (table_name || @table_name).to_s
|
20
|
-
[
|
20
|
+
[target_schema, name].compact.join('.')
|
21
21
|
end
|
22
22
|
|
23
|
-
def
|
23
|
+
def target_table_name_sql(table_name = nil)
|
24
24
|
name = (table_name || @table_name).to_s
|
25
|
-
|
26
|
-
%Q("#{
|
25
|
+
target_table_name= [target_schema, name].compact.join('"."')
|
26
|
+
%Q("#{target_table_name}")
|
27
27
|
end
|
28
28
|
|
29
29
|
private
|
30
30
|
|
31
|
-
def
|
32
|
-
|
33
|
-
|
31
|
+
def target_schema
|
32
|
+
target_schema = BeetleETL.config.target_schema
|
33
|
+
target_schema != 'public' ? target_schema : nil
|
34
34
|
end
|
35
35
|
|
36
36
|
end
|
@@ -11,7 +11,7 @@ module BeetleETL
|
|
11
11
|
|
12
12
|
def resolvables(resolved)
|
13
13
|
@items.select do |item|
|
14
|
-
|
14
|
+
!resolved.include?(item.name) && all_dependencies_met?(item, resolved)
|
15
15
|
end
|
16
16
|
end
|
17
17
|
|
@@ -22,18 +22,15 @@ module BeetleETL
|
|
22
22
|
resolved = []
|
23
23
|
|
24
24
|
until items.empty?
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
item.dependencies.subset?(resolved_names) || item.dependencies.empty?
|
29
|
-
end
|
30
|
-
|
31
|
-
raise UnsatisfiableDependenciesError if resolvable.empty?
|
32
|
-
|
33
|
-
resolvable.each { |r| items.delete r }
|
34
|
-
resolved << resolvable
|
25
|
+
resolvables = items.select { |item| all_dependencies_met?(item, resolved.map(&:name)) }
|
26
|
+
raise UnsatisfiableDependenciesError if resolvables.empty?
|
27
|
+
resolvables.each { |r| resolved << items.delete(r) }
|
35
28
|
end
|
36
29
|
end
|
37
30
|
|
31
|
+
def all_dependencies_met?(item, resolved)
|
32
|
+
item.dependencies.empty? || item.dependencies.subset?(resolved.to_set)
|
33
|
+
end
|
34
|
+
|
38
35
|
end
|
39
36
|
end
|
@@ -8,12 +8,12 @@ module BeetleETL
|
|
8
8
|
def run
|
9
9
|
database.execute <<-SQL
|
10
10
|
UPDATE #{stage_table_name_sql} stage_update
|
11
|
-
SET id = COALESCE(
|
11
|
+
SET id = COALESCE(target.id, nextval('#{table_name}_id_seq'))
|
12
12
|
FROM #{stage_table_name_sql} stage
|
13
|
-
LEFT OUTER JOIN #{
|
13
|
+
LEFT OUTER JOIN #{target_table_name_sql} target
|
14
14
|
on (
|
15
|
-
stage.external_id =
|
16
|
-
AND
|
15
|
+
stage.external_id = target.external_id
|
16
|
+
AND target.external_source = '#{external_source}'
|
17
17
|
)
|
18
18
|
WHERE stage_update.external_id = stage.external_id
|
19
19
|
SQL
|
@@ -23,6 +23,12 @@ module BeetleETL
|
|
23
23
|
|
24
24
|
#{index_definitions};
|
25
25
|
|
26
|
+
ALTER TABLE #{stage_table_name_sql}
|
27
|
+
SET (
|
28
|
+
autovacuum_enabled = false,
|
29
|
+
toast.autovacuum_enabled = false
|
30
|
+
);
|
31
|
+
|
26
32
|
TRUNCATE TABLE #{stage_table_name_sql} RESTART IDENTITY CASCADE;
|
27
33
|
SQL
|
28
34
|
end
|
@@ -70,7 +76,7 @@ module BeetleETL
|
|
70
76
|
end
|
71
77
|
|
72
78
|
def column_type(column_name)
|
73
|
-
@column_types ||= Hash[database.schema(
|
79
|
+
@column_types ||= Hash[database.schema(target_table_name.to_sym)]
|
74
80
|
.reduce({}) do |acc, (name, schema)|
|
75
81
|
acc[name.to_sym] = schema.fetch(:db_type)
|
76
82
|
acc
|
@@ -12,7 +12,7 @@ module BeetleETL
|
|
12
12
|
end
|
13
13
|
|
14
14
|
def run
|
15
|
-
%w(create update delete
|
15
|
+
%w(create update delete).each do |transition|
|
16
16
|
public_send(:"load_#{transition}")
|
17
17
|
end
|
18
18
|
end
|
@@ -25,7 +25,7 @@ module BeetleETL
|
|
25
25
|
just_now = now
|
26
26
|
|
27
27
|
database.execute <<-SQL
|
28
|
-
INSERT INTO #{
|
28
|
+
INSERT INTO #{target_table_name_sql}
|
29
29
|
(#{data_columns.join(', ')}, external_source, created_at, updated_at)
|
30
30
|
SELECT
|
31
31
|
#{data_columns.join(', ')},
|
@@ -39,13 +39,14 @@ module BeetleETL
|
|
39
39
|
|
40
40
|
def load_update
|
41
41
|
database.execute <<-SQL
|
42
|
-
UPDATE #{
|
42
|
+
UPDATE #{target_table_name_sql} target
|
43
43
|
SET
|
44
44
|
#{updatable_columns.map { |c| %Q("#{c}" = stage."#{c}") }.join(',')},
|
45
|
-
"updated_at" = '#{now}'
|
45
|
+
"updated_at" = '#{now}',
|
46
|
+
deleted_at = NULL
|
46
47
|
FROM #{stage_table_name_sql} stage
|
47
|
-
WHERE stage.id =
|
48
|
-
AND stage.transition
|
48
|
+
WHERE stage.id = target.id
|
49
|
+
AND stage.transition IN ('UPDATE', 'REINSTATE')
|
49
50
|
SQL
|
50
51
|
end
|
51
52
|
|
@@ -53,29 +54,16 @@ module BeetleETL
|
|
53
54
|
just_now = now
|
54
55
|
|
55
56
|
database.execute <<-SQL
|
56
|
-
UPDATE #{
|
57
|
+
UPDATE #{target_table_name_sql} target
|
57
58
|
SET
|
58
59
|
updated_at = '#{just_now}',
|
59
60
|
deleted_at = '#{just_now}'
|
60
61
|
FROM #{stage_table_name_sql} stage
|
61
|
-
WHERE stage.id =
|
62
|
+
WHERE stage.id = target.id
|
62
63
|
AND stage.transition = 'DELETE'
|
63
64
|
SQL
|
64
65
|
end
|
65
66
|
|
66
|
-
def load_reinstate
|
67
|
-
database.execute <<-SQL
|
68
|
-
UPDATE #{public_table_name_sql} public
|
69
|
-
SET
|
70
|
-
#{updatable_columns.map { |c| %Q("#{c}" = stage."#{c}") }.join(',')},
|
71
|
-
updated_at = '#{now}',
|
72
|
-
deleted_at = NULL
|
73
|
-
FROM #{stage_table_name_sql} stage
|
74
|
-
WHERE stage.id = public.id
|
75
|
-
AND stage.transition = 'REINSTATE'
|
76
|
-
SQL
|
77
|
-
end
|
78
|
-
|
79
67
|
private
|
80
68
|
|
81
69
|
def data_columns
|
@@ -7,7 +7,8 @@ module BeetleETL
|
|
7
7
|
end
|
8
8
|
|
9
9
|
def dependencies
|
10
|
-
|
10
|
+
result = Set.new([Transform.step_name(table_name)])
|
11
|
+
result.merge @relations.values.map { |d| AssignIds.step_name(d) }
|
11
12
|
end
|
12
13
|
|
13
14
|
def run
|
@@ -22,9 +22,9 @@ module BeetleETL
|
|
22
22
|
SET transition = 'CREATE'
|
23
23
|
WHERE NOT EXISTS (
|
24
24
|
SELECT 1
|
25
|
-
FROM #{
|
26
|
-
WHERE
|
27
|
-
AND
|
25
|
+
FROM #{target_table_name} target
|
26
|
+
WHERE target.external_id = stage.external_id
|
27
|
+
AND target.external_source = '#{external_source}'
|
28
28
|
)
|
29
29
|
SQL
|
30
30
|
end
|
@@ -35,12 +35,12 @@ module BeetleETL
|
|
35
35
|
SET transition = 'UPDATE'
|
36
36
|
WHERE EXISTS (
|
37
37
|
SELECT 1
|
38
|
-
FROM #{
|
39
|
-
WHERE
|
40
|
-
AND
|
41
|
-
AND
|
38
|
+
FROM #{target_table_name} target
|
39
|
+
WHERE target.external_id = stage.external_id
|
40
|
+
AND target.external_source = '#{external_source}'
|
41
|
+
AND target.deleted_at IS NULL
|
42
42
|
AND
|
43
|
-
(#{
|
43
|
+
(#{target_record_columns.join(', ')})
|
44
44
|
IS DISTINCT FROM
|
45
45
|
(#{stage_record_columns.join(', ')})
|
46
46
|
)
|
@@ -52,14 +52,14 @@ module BeetleETL
|
|
52
52
|
INSERT INTO #{stage_table_name_sql}
|
53
53
|
(external_id, transition)
|
54
54
|
SELECT
|
55
|
-
|
55
|
+
target.external_id,
|
56
56
|
'DELETE'
|
57
|
-
FROM #{
|
57
|
+
FROM #{target_table_name_sql} target
|
58
58
|
LEFT OUTER JOIN #{stage_table_name_sql} stage
|
59
|
-
ON (stage.external_id =
|
59
|
+
ON (stage.external_id = target.external_id)
|
60
60
|
WHERE stage.external_id IS NULL
|
61
|
-
AND
|
62
|
-
AND
|
61
|
+
AND target.external_source = '#{external_source}'
|
62
|
+
AND target.deleted_at IS NULL
|
63
63
|
SQL
|
64
64
|
end
|
65
65
|
|
@@ -69,18 +69,18 @@ module BeetleETL
|
|
69
69
|
SET transition = 'REINSTATE'
|
70
70
|
WHERE EXISTS (
|
71
71
|
SELECT 1
|
72
|
-
FROM #{
|
73
|
-
WHERE
|
74
|
-
AND
|
75
|
-
AND
|
72
|
+
FROM #{target_table_name_sql} target
|
73
|
+
WHERE target.external_id = stage.external_id
|
74
|
+
AND target.external_source = '#{external_source}'
|
75
|
+
AND target.deleted_at IS NOT NULL
|
76
76
|
)
|
77
77
|
SQL
|
78
78
|
end
|
79
79
|
|
80
80
|
private
|
81
81
|
|
82
|
-
def
|
83
|
-
prefixed_columns(data_columns, '
|
82
|
+
def target_record_columns
|
83
|
+
prefixed_columns(data_columns, 'target')
|
84
84
|
end
|
85
85
|
|
86
86
|
def stage_record_columns
|
data/lib/beetle_etl/version.rb
CHANGED
data/spec/steps/load_spec.rb
CHANGED
@@ -65,7 +65,7 @@ module BeetleETL
|
|
65
65
|
|
66
66
|
describe '#run' do
|
67
67
|
it 'runs all load steps' do
|
68
|
-
%w(create update delete
|
68
|
+
%w(create update delete).each do |transition|
|
69
69
|
expect(subject).to receive(:"load_#{transition}")
|
70
70
|
end
|
71
71
|
|
@@ -74,7 +74,7 @@ module BeetleETL
|
|
74
74
|
end
|
75
75
|
|
76
76
|
describe '#load_create' do
|
77
|
-
it 'loads records into the
|
77
|
+
it 'loads records into the target table' do
|
78
78
|
insert_into(subject.stage_table_name.to_sym).values(
|
79
79
|
[ :id , :external_id , :transition , :external_foo_id , :foo_id , :payload ] ,
|
80
80
|
[ 3 , 'external_id' , 'CREATE' , 'foo_id' , 22 , 'content' ] ,
|
@@ -108,10 +108,8 @@ module BeetleETL
|
|
108
108
|
[ 1 , 'external_id' , external_source , 33 , yesterday , now , nil , 'updated content' ] ,
|
109
109
|
)
|
110
110
|
end
|
111
|
-
end
|
112
111
|
|
113
|
-
|
114
|
-
it 'marks existing records as deleted' do
|
112
|
+
it 'restores deleted records' do
|
115
113
|
insert_into(:example_table).values(
|
116
114
|
[ :id , :external_id , :external_source , :foo_id , :created_at , :updated_at , :deleted_at , :payload ] ,
|
117
115
|
[ 1 , 'external_id' , external_source , 22 , yesterday , yesterday , nil , 'content' ] ,
|
@@ -119,20 +117,20 @@ module BeetleETL
|
|
119
117
|
|
120
118
|
insert_into(subject.stage_table_name.to_sym).values(
|
121
119
|
[ :id , :external_id , :transition , :external_foo_id , :foo_id , :payload ] ,
|
122
|
-
[ 1 , 'external_id' , '
|
120
|
+
[ 1 , 'external_id' , 'REINSTATE' , 'foo_id' , 33 , 'updated content' ] ,
|
123
121
|
)
|
124
122
|
|
125
|
-
subject.
|
123
|
+
subject.load_update
|
126
124
|
|
127
125
|
expect(:example_table).to have_values(
|
128
|
-
[ :id , :external_id , :external_source , :foo_id , :created_at , :updated_at , :deleted_at , :payload
|
129
|
-
[ 1 , 'external_id' , external_source ,
|
126
|
+
[ :id , :external_id , :external_source , :foo_id , :created_at , :updated_at , :deleted_at , :payload ] ,
|
127
|
+
[ 1 , 'external_id' , external_source , 33 , yesterday , now , nil , 'updated content' ] ,
|
130
128
|
)
|
131
129
|
end
|
132
130
|
end
|
133
131
|
|
134
|
-
describe '#
|
135
|
-
it '
|
132
|
+
describe '#load_delete' do
|
133
|
+
it 'marks existing records as deleted' do
|
136
134
|
insert_into(:example_table).values(
|
137
135
|
[ :id , :external_id , :external_source , :foo_id , :created_at , :updated_at , :deleted_at , :payload ] ,
|
138
136
|
[ 1 , 'external_id' , external_source , 22 , yesterday , yesterday , nil , 'content' ] ,
|
@@ -140,14 +138,14 @@ module BeetleETL
|
|
140
138
|
|
141
139
|
insert_into(subject.stage_table_name.to_sym).values(
|
142
140
|
[ :id , :external_id , :transition , :external_foo_id , :foo_id , :payload ] ,
|
143
|
-
[ 1 , 'external_id' , '
|
141
|
+
[ 1 , 'external_id' , 'DELETE' , 'foo_id' , 33 , 'updated content' ] ,
|
144
142
|
)
|
145
143
|
|
146
|
-
subject.
|
144
|
+
subject.load_delete
|
147
145
|
|
148
146
|
expect(:example_table).to have_values(
|
149
|
-
[ :id , :external_id , :external_source , :foo_id , :created_at , :updated_at , :deleted_at , :payload
|
150
|
-
[ 1 , 'external_id' , external_source ,
|
147
|
+
[ :id , :external_id , :external_source , :foo_id , :created_at , :updated_at , :deleted_at , :payload ] ,
|
148
|
+
[ 1 , 'external_id' , external_source , 22 , yesterday , now , now , 'content' ] ,
|
151
149
|
)
|
152
150
|
end
|
153
151
|
end
|
@@ -34,6 +34,7 @@ module BeetleETL
|
|
34
34
|
end
|
35
35
|
|
36
36
|
it 'returns all items with met dependencies' do
|
37
|
+
expect(resolver.resolvables([:a])).to match_array([b, c])
|
37
38
|
expect(resolver.resolvables([:a, :b, :c])).to match_array([d])
|
38
39
|
expect(resolver.resolvables([:a, :b, :c, :d])).to match_array([e, f])
|
39
40
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: beetle_etl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.19
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luciano Maiwald
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-04-
|
11
|
+
date: 2015-04-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: sequel
|