beetle_etl 0.0.16 → 0.0.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +50 -2
- data/lib/beetle_etl.rb +2 -2
- data/lib/beetle_etl/dsl/transformation.rb +2 -2
- data/lib/beetle_etl/naming.rb +8 -8
- data/lib/beetle_etl/step_runner/dependency_resolver.rb +8 -11
- data/lib/beetle_etl/steps/assign_ids.rb +4 -4
- data/lib/beetle_etl/steps/create_stage.rb +7 -1
- data/lib/beetle_etl/steps/load.rb +9 -21
- data/lib/beetle_etl/steps/map_relations.rb +2 -1
- data/lib/beetle_etl/steps/table_diff.rb +19 -19
- data/lib/beetle_etl/version.rb +1 -1
- data/spec/steps/load_spec.rb +13 -15
- data/spec/task_runner/dependency_resolver_spec.rb +1 -0
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: dacfd8801cf603b9cec442ee9ea025bc14026234
|
4
|
+
data.tar.gz: 1a183e975200bb9bdc6e631130903b6ead9739be
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: bacbe3f6292e52b1ad23d38f5e128b1cf39ec63ded808310a8353956f2b11d349b4ea546012beb22e03edbd3de9bdd787ca0bbde329e1755ebacc23c9561a8ca
|
7
|
+
data.tar.gz: cbc51b9be8c35b70c21829af0c38f6db0898f4b5c38a2fc6138968bc63591e91f51d7558b2fff62196ba43789b3255f2347c9982d78de33a4d025e3da167084d
|
data/README.md
CHANGED
@@ -2,7 +2,9 @@
|
|
2
2
|
[](https://travis-ci.org/maiwald/beetle_etl)
|
3
3
|
[](https://codeclimate.com/github/maiwald/beetle_etl)
|
4
4
|
|
5
|
-
|
5
|
+
BeetleETL helps you with synchronising relational databases and recurring imports of data. It is actually quite nice.
|
6
|
+
|
7
|
+
It currently only works with PostgreSQL databases.
|
6
8
|
|
7
9
|
## Installation
|
8
10
|
|
@@ -20,7 +22,53 @@ Or install it yourself as:
|
|
20
22
|
|
21
23
|
## Usage
|
22
24
|
|
23
|
-
|
25
|
+
### Configuration
|
26
|
+
|
27
|
+
BeetleETL.configure do |config|
|
28
|
+
config.transformation_file = # path to your imports
|
29
|
+
config.database_config = # sequel database config
|
30
|
+
# or config.database = # sequel database instance
|
31
|
+
config.external_source = 'source_name'
|
32
|
+
config.logger = Logger.new(STDOUT)
|
33
|
+
end
|
34
|
+
|
35
|
+
### Defining Imports
|
36
|
+
|
37
|
+
Fill a file with all the tables you wish to import and write queries to select the data you want.
|
38
|
+
|
39
|
+
import :departments do
|
40
|
+
columns :name
|
41
|
+
|
42
|
+
references :organisations, on: :organisation_id
|
43
|
+
|
44
|
+
query <<-SQL
|
45
|
+
INSERT INTO #{stage_table} (
|
46
|
+
external_id,
|
47
|
+
name,
|
48
|
+
external_organisation_id
|
49
|
+
)
|
50
|
+
|
51
|
+
SELECT
|
52
|
+
o.id,
|
53
|
+
o."dep_name",
|
54
|
+
data."address"
|
55
|
+
|
56
|
+
FROM "Organisation" o
|
57
|
+
JOIN additional_data data
|
58
|
+
ON data.org_id = o.id
|
59
|
+
SQL
|
60
|
+
end
|
61
|
+
|
62
|
+
|
63
|
+
### Running BeetleETL
|
64
|
+
|
65
|
+
BeetleETL.import
|
66
|
+
|
67
|
+
## Development
|
68
|
+
|
69
|
+
To run the specs call
|
70
|
+
|
71
|
+
$ bundle exec rspec
|
24
72
|
|
25
73
|
## Contributing
|
26
74
|
|
data/lib/beetle_etl.rb
CHANGED
@@ -34,12 +34,12 @@ module BeetleETL
|
|
34
34
|
:database,
|
35
35
|
:transformation_file,
|
36
36
|
:stage_schema,
|
37
|
-
:
|
37
|
+
:target_schema,
|
38
38
|
:external_source,
|
39
39
|
:logger
|
40
40
|
|
41
41
|
def initialize
|
42
|
-
@
|
42
|
+
@target_schema = 'public'
|
43
43
|
@logger = ::Logger.new(STDOUT)
|
44
44
|
end
|
45
45
|
end
|
@@ -8,8 +8,8 @@ module BeetleETL
|
|
8
8
|
def initialize(table_name, setup, helpers = nil)
|
9
9
|
@table_name = table_name
|
10
10
|
@parsed = DSL.new(table_name).tap do |dsl|
|
11
|
-
dsl.
|
12
|
-
dsl.
|
11
|
+
dsl.instance_exec(&helpers) if helpers
|
12
|
+
dsl.instance_exec(&setup)
|
13
13
|
end
|
14
14
|
end
|
15
15
|
|
data/lib/beetle_etl/naming.rb
CHANGED
@@ -15,22 +15,22 @@ module BeetleETL
|
|
15
15
|
%Q("#{stage_table_name(table_name)}")
|
16
16
|
end
|
17
17
|
|
18
|
-
def
|
18
|
+
def target_table_name(table_name = nil)
|
19
19
|
name = (table_name || @table_name).to_s
|
20
|
-
[
|
20
|
+
[target_schema, name].compact.join('.')
|
21
21
|
end
|
22
22
|
|
23
|
-
def
|
23
|
+
def target_table_name_sql(table_name = nil)
|
24
24
|
name = (table_name || @table_name).to_s
|
25
|
-
|
26
|
-
%Q("#{
|
25
|
+
target_table_name= [target_schema, name].compact.join('"."')
|
26
|
+
%Q("#{target_table_name}")
|
27
27
|
end
|
28
28
|
|
29
29
|
private
|
30
30
|
|
31
|
-
def
|
32
|
-
|
33
|
-
|
31
|
+
def target_schema
|
32
|
+
target_schema = BeetleETL.config.target_schema
|
33
|
+
target_schema != 'public' ? target_schema : nil
|
34
34
|
end
|
35
35
|
|
36
36
|
end
|
@@ -11,7 +11,7 @@ module BeetleETL
|
|
11
11
|
|
12
12
|
def resolvables(resolved)
|
13
13
|
@items.select do |item|
|
14
|
-
|
14
|
+
!resolved.include?(item.name) && all_dependencies_met?(item, resolved)
|
15
15
|
end
|
16
16
|
end
|
17
17
|
|
@@ -22,18 +22,15 @@ module BeetleETL
|
|
22
22
|
resolved = []
|
23
23
|
|
24
24
|
until items.empty?
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
item.dependencies.subset?(resolved_names) || item.dependencies.empty?
|
29
|
-
end
|
30
|
-
|
31
|
-
raise UnsatisfiableDependenciesError if resolvable.empty?
|
32
|
-
|
33
|
-
resolvable.each { |r| items.delete r }
|
34
|
-
resolved << resolvable
|
25
|
+
resolvables = items.select { |item| all_dependencies_met?(item, resolved.map(&:name)) }
|
26
|
+
raise UnsatisfiableDependenciesError if resolvables.empty?
|
27
|
+
resolvables.each { |r| resolved << items.delete(r) }
|
35
28
|
end
|
36
29
|
end
|
37
30
|
|
31
|
+
def all_dependencies_met?(item, resolved)
|
32
|
+
item.dependencies.empty? || item.dependencies.subset?(resolved.to_set)
|
33
|
+
end
|
34
|
+
|
38
35
|
end
|
39
36
|
end
|
@@ -8,12 +8,12 @@ module BeetleETL
|
|
8
8
|
def run
|
9
9
|
database.execute <<-SQL
|
10
10
|
UPDATE #{stage_table_name_sql} stage_update
|
11
|
-
SET id = COALESCE(
|
11
|
+
SET id = COALESCE(target.id, nextval('#{table_name}_id_seq'))
|
12
12
|
FROM #{stage_table_name_sql} stage
|
13
|
-
LEFT OUTER JOIN #{
|
13
|
+
LEFT OUTER JOIN #{target_table_name_sql} target
|
14
14
|
on (
|
15
|
-
stage.external_id =
|
16
|
-
AND
|
15
|
+
stage.external_id = target.external_id
|
16
|
+
AND target.external_source = '#{external_source}'
|
17
17
|
)
|
18
18
|
WHERE stage_update.external_id = stage.external_id
|
19
19
|
SQL
|
@@ -23,6 +23,12 @@ module BeetleETL
|
|
23
23
|
|
24
24
|
#{index_definitions};
|
25
25
|
|
26
|
+
ALTER TABLE #{stage_table_name_sql}
|
27
|
+
SET (
|
28
|
+
autovacuum_enabled = false,
|
29
|
+
toast.autovacuum_enabled = false
|
30
|
+
);
|
31
|
+
|
26
32
|
TRUNCATE TABLE #{stage_table_name_sql} RESTART IDENTITY CASCADE;
|
27
33
|
SQL
|
28
34
|
end
|
@@ -70,7 +76,7 @@ module BeetleETL
|
|
70
76
|
end
|
71
77
|
|
72
78
|
def column_type(column_name)
|
73
|
-
@column_types ||= Hash[database.schema(
|
79
|
+
@column_types ||= Hash[database.schema(target_table_name.to_sym)]
|
74
80
|
.reduce({}) do |acc, (name, schema)|
|
75
81
|
acc[name.to_sym] = schema.fetch(:db_type)
|
76
82
|
acc
|
@@ -12,7 +12,7 @@ module BeetleETL
|
|
12
12
|
end
|
13
13
|
|
14
14
|
def run
|
15
|
-
%w(create update delete
|
15
|
+
%w(create update delete).each do |transition|
|
16
16
|
public_send(:"load_#{transition}")
|
17
17
|
end
|
18
18
|
end
|
@@ -25,7 +25,7 @@ module BeetleETL
|
|
25
25
|
just_now = now
|
26
26
|
|
27
27
|
database.execute <<-SQL
|
28
|
-
INSERT INTO #{
|
28
|
+
INSERT INTO #{target_table_name_sql}
|
29
29
|
(#{data_columns.join(', ')}, external_source, created_at, updated_at)
|
30
30
|
SELECT
|
31
31
|
#{data_columns.join(', ')},
|
@@ -39,13 +39,14 @@ module BeetleETL
|
|
39
39
|
|
40
40
|
def load_update
|
41
41
|
database.execute <<-SQL
|
42
|
-
UPDATE #{
|
42
|
+
UPDATE #{target_table_name_sql} target
|
43
43
|
SET
|
44
44
|
#{updatable_columns.map { |c| %Q("#{c}" = stage."#{c}") }.join(',')},
|
45
|
-
"updated_at" = '#{now}'
|
45
|
+
"updated_at" = '#{now}',
|
46
|
+
deleted_at = NULL
|
46
47
|
FROM #{stage_table_name_sql} stage
|
47
|
-
WHERE stage.id =
|
48
|
-
AND stage.transition
|
48
|
+
WHERE stage.id = target.id
|
49
|
+
AND stage.transition IN ('UPDATE', 'REINSTATE')
|
49
50
|
SQL
|
50
51
|
end
|
51
52
|
|
@@ -53,29 +54,16 @@ module BeetleETL
|
|
53
54
|
just_now = now
|
54
55
|
|
55
56
|
database.execute <<-SQL
|
56
|
-
UPDATE #{
|
57
|
+
UPDATE #{target_table_name_sql} target
|
57
58
|
SET
|
58
59
|
updated_at = '#{just_now}',
|
59
60
|
deleted_at = '#{just_now}'
|
60
61
|
FROM #{stage_table_name_sql} stage
|
61
|
-
WHERE stage.id =
|
62
|
+
WHERE stage.id = target.id
|
62
63
|
AND stage.transition = 'DELETE'
|
63
64
|
SQL
|
64
65
|
end
|
65
66
|
|
66
|
-
def load_reinstate
|
67
|
-
database.execute <<-SQL
|
68
|
-
UPDATE #{public_table_name_sql} public
|
69
|
-
SET
|
70
|
-
#{updatable_columns.map { |c| %Q("#{c}" = stage."#{c}") }.join(',')},
|
71
|
-
updated_at = '#{now}',
|
72
|
-
deleted_at = NULL
|
73
|
-
FROM #{stage_table_name_sql} stage
|
74
|
-
WHERE stage.id = public.id
|
75
|
-
AND stage.transition = 'REINSTATE'
|
76
|
-
SQL
|
77
|
-
end
|
78
|
-
|
79
67
|
private
|
80
68
|
|
81
69
|
def data_columns
|
@@ -7,7 +7,8 @@ module BeetleETL
|
|
7
7
|
end
|
8
8
|
|
9
9
|
def dependencies
|
10
|
-
|
10
|
+
result = Set.new([Transform.step_name(table_name)])
|
11
|
+
result.merge @relations.values.map { |d| AssignIds.step_name(d) }
|
11
12
|
end
|
12
13
|
|
13
14
|
def run
|
@@ -22,9 +22,9 @@ module BeetleETL
|
|
22
22
|
SET transition = 'CREATE'
|
23
23
|
WHERE NOT EXISTS (
|
24
24
|
SELECT 1
|
25
|
-
FROM #{
|
26
|
-
WHERE
|
27
|
-
AND
|
25
|
+
FROM #{target_table_name} target
|
26
|
+
WHERE target.external_id = stage.external_id
|
27
|
+
AND target.external_source = '#{external_source}'
|
28
28
|
)
|
29
29
|
SQL
|
30
30
|
end
|
@@ -35,12 +35,12 @@ module BeetleETL
|
|
35
35
|
SET transition = 'UPDATE'
|
36
36
|
WHERE EXISTS (
|
37
37
|
SELECT 1
|
38
|
-
FROM #{
|
39
|
-
WHERE
|
40
|
-
AND
|
41
|
-
AND
|
38
|
+
FROM #{target_table_name} target
|
39
|
+
WHERE target.external_id = stage.external_id
|
40
|
+
AND target.external_source = '#{external_source}'
|
41
|
+
AND target.deleted_at IS NULL
|
42
42
|
AND
|
43
|
-
(#{
|
43
|
+
(#{target_record_columns.join(', ')})
|
44
44
|
IS DISTINCT FROM
|
45
45
|
(#{stage_record_columns.join(', ')})
|
46
46
|
)
|
@@ -52,14 +52,14 @@ module BeetleETL
|
|
52
52
|
INSERT INTO #{stage_table_name_sql}
|
53
53
|
(external_id, transition)
|
54
54
|
SELECT
|
55
|
-
|
55
|
+
target.external_id,
|
56
56
|
'DELETE'
|
57
|
-
FROM #{
|
57
|
+
FROM #{target_table_name_sql} target
|
58
58
|
LEFT OUTER JOIN #{stage_table_name_sql} stage
|
59
|
-
ON (stage.external_id =
|
59
|
+
ON (stage.external_id = target.external_id)
|
60
60
|
WHERE stage.external_id IS NULL
|
61
|
-
AND
|
62
|
-
AND
|
61
|
+
AND target.external_source = '#{external_source}'
|
62
|
+
AND target.deleted_at IS NULL
|
63
63
|
SQL
|
64
64
|
end
|
65
65
|
|
@@ -69,18 +69,18 @@ module BeetleETL
|
|
69
69
|
SET transition = 'REINSTATE'
|
70
70
|
WHERE EXISTS (
|
71
71
|
SELECT 1
|
72
|
-
FROM #{
|
73
|
-
WHERE
|
74
|
-
AND
|
75
|
-
AND
|
72
|
+
FROM #{target_table_name_sql} target
|
73
|
+
WHERE target.external_id = stage.external_id
|
74
|
+
AND target.external_source = '#{external_source}'
|
75
|
+
AND target.deleted_at IS NOT NULL
|
76
76
|
)
|
77
77
|
SQL
|
78
78
|
end
|
79
79
|
|
80
80
|
private
|
81
81
|
|
82
|
-
def
|
83
|
-
prefixed_columns(data_columns, '
|
82
|
+
def target_record_columns
|
83
|
+
prefixed_columns(data_columns, 'target')
|
84
84
|
end
|
85
85
|
|
86
86
|
def stage_record_columns
|
data/lib/beetle_etl/version.rb
CHANGED
data/spec/steps/load_spec.rb
CHANGED
@@ -65,7 +65,7 @@ module BeetleETL
|
|
65
65
|
|
66
66
|
describe '#run' do
|
67
67
|
it 'runs all load steps' do
|
68
|
-
%w(create update delete
|
68
|
+
%w(create update delete).each do |transition|
|
69
69
|
expect(subject).to receive(:"load_#{transition}")
|
70
70
|
end
|
71
71
|
|
@@ -74,7 +74,7 @@ module BeetleETL
|
|
74
74
|
end
|
75
75
|
|
76
76
|
describe '#load_create' do
|
77
|
-
it 'loads records into the
|
77
|
+
it 'loads records into the target table' do
|
78
78
|
insert_into(subject.stage_table_name.to_sym).values(
|
79
79
|
[ :id , :external_id , :transition , :external_foo_id , :foo_id , :payload ] ,
|
80
80
|
[ 3 , 'external_id' , 'CREATE' , 'foo_id' , 22 , 'content' ] ,
|
@@ -108,10 +108,8 @@ module BeetleETL
|
|
108
108
|
[ 1 , 'external_id' , external_source , 33 , yesterday , now , nil , 'updated content' ] ,
|
109
109
|
)
|
110
110
|
end
|
111
|
-
end
|
112
111
|
|
113
|
-
|
114
|
-
it 'marks existing records as deleted' do
|
112
|
+
it 'restores deleted records' do
|
115
113
|
insert_into(:example_table).values(
|
116
114
|
[ :id , :external_id , :external_source , :foo_id , :created_at , :updated_at , :deleted_at , :payload ] ,
|
117
115
|
[ 1 , 'external_id' , external_source , 22 , yesterday , yesterday , nil , 'content' ] ,
|
@@ -119,20 +117,20 @@ module BeetleETL
|
|
119
117
|
|
120
118
|
insert_into(subject.stage_table_name.to_sym).values(
|
121
119
|
[ :id , :external_id , :transition , :external_foo_id , :foo_id , :payload ] ,
|
122
|
-
[ 1 , 'external_id' , '
|
120
|
+
[ 1 , 'external_id' , 'REINSTATE' , 'foo_id' , 33 , 'updated content' ] ,
|
123
121
|
)
|
124
122
|
|
125
|
-
subject.
|
123
|
+
subject.load_update
|
126
124
|
|
127
125
|
expect(:example_table).to have_values(
|
128
|
-
[ :id , :external_id , :external_source , :foo_id , :created_at , :updated_at , :deleted_at , :payload
|
129
|
-
[ 1 , 'external_id' , external_source ,
|
126
|
+
[ :id , :external_id , :external_source , :foo_id , :created_at , :updated_at , :deleted_at , :payload ] ,
|
127
|
+
[ 1 , 'external_id' , external_source , 33 , yesterday , now , nil , 'updated content' ] ,
|
130
128
|
)
|
131
129
|
end
|
132
130
|
end
|
133
131
|
|
134
|
-
describe '#
|
135
|
-
it '
|
132
|
+
describe '#load_delete' do
|
133
|
+
it 'marks existing records as deleted' do
|
136
134
|
insert_into(:example_table).values(
|
137
135
|
[ :id , :external_id , :external_source , :foo_id , :created_at , :updated_at , :deleted_at , :payload ] ,
|
138
136
|
[ 1 , 'external_id' , external_source , 22 , yesterday , yesterday , nil , 'content' ] ,
|
@@ -140,14 +138,14 @@ module BeetleETL
|
|
140
138
|
|
141
139
|
insert_into(subject.stage_table_name.to_sym).values(
|
142
140
|
[ :id , :external_id , :transition , :external_foo_id , :foo_id , :payload ] ,
|
143
|
-
[ 1 , 'external_id' , '
|
141
|
+
[ 1 , 'external_id' , 'DELETE' , 'foo_id' , 33 , 'updated content' ] ,
|
144
142
|
)
|
145
143
|
|
146
|
-
subject.
|
144
|
+
subject.load_delete
|
147
145
|
|
148
146
|
expect(:example_table).to have_values(
|
149
|
-
[ :id , :external_id , :external_source , :foo_id , :created_at , :updated_at , :deleted_at , :payload
|
150
|
-
[ 1 , 'external_id' , external_source ,
|
147
|
+
[ :id , :external_id , :external_source , :foo_id , :created_at , :updated_at , :deleted_at , :payload ] ,
|
148
|
+
[ 1 , 'external_id' , external_source , 22 , yesterday , now , now , 'content' ] ,
|
151
149
|
)
|
152
150
|
end
|
153
151
|
end
|
@@ -34,6 +34,7 @@ module BeetleETL
|
|
34
34
|
end
|
35
35
|
|
36
36
|
it 'returns all items with met dependencies' do
|
37
|
+
expect(resolver.resolvables([:a])).to match_array([b, c])
|
37
38
|
expect(resolver.resolvables([:a, :b, :c])).to match_array([d])
|
38
39
|
expect(resolver.resolvables([:a, :b, :c, :d])).to match_array([e, f])
|
39
40
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: beetle_etl
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.19
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Luciano Maiwald
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-04-
|
11
|
+
date: 2015-04-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: sequel
|