beetle_etl 0.0.2 → 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +6 -0
- data/beetle_etl.gemspec +5 -6
- data/lib/beetle_etl.rb +7 -12
- data/lib/beetle_etl/dsl/dsl.rb +11 -12
- data/lib/beetle_etl/dsl/transformation.rb +10 -3
- data/lib/beetle_etl/dsl/transformation_loader.rb +13 -5
- data/lib/beetle_etl/import.rb +11 -4
- data/lib/beetle_etl/naming.rb +37 -0
- data/lib/beetle_etl/steps/assign_ids.rb +14 -38
- data/lib/beetle_etl/steps/create_stage.rb +59 -0
- data/lib/beetle_etl/steps/drop_stage.rb +15 -0
- data/lib/beetle_etl/steps/load.rb +46 -61
- data/lib/beetle_etl/steps/map_relations.rb +8 -14
- data/lib/beetle_etl/steps/step.rb +1 -8
- data/lib/beetle_etl/steps/table_diff.rb +68 -89
- data/lib/beetle_etl/steps/transform.rb +2 -4
- data/lib/beetle_etl/version.rb +1 -1
- data/spec/beetle_etl_spec.rb +3 -25
- data/spec/dsl/dsl_spec.rb +8 -15
- data/spec/dsl/transformation_loader_spec.rb +11 -4
- data/spec/dsl/transformation_spec.rb +40 -4
- data/spec/feature/example_schema.rb +2 -137
- data/spec/feature/example_transform.rb +13 -6
- data/spec/feature/feature_spec.rb +119 -18
- data/spec/steps/assign_ids_spec.rb +23 -28
- data/spec/steps/create_stage_spec.rb +89 -0
- data/spec/steps/load_spec.rb +15 -23
- data/spec/steps/map_relations_spec.rb +32 -36
- data/spec/steps/table_diff_spec.rb +41 -45
- data/spec/steps/transform_spec.rb +2 -0
- data/spec/{dependency_resolver_spec.rb → task_runner/dependency_resolver_spec.rb} +0 -0
- metadata +22 -36
- data/lib/beetle_etl/state.rb +0 -67
- data/spec/import_spec.rb +0 -7
- data/spec/state_spec.rb +0 -124
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9d792ed373d43d6c0ab7ec27241eaf18fb59736b
|
4
|
+
data.tar.gz: 7b08688ce87fbff7eea72b362773444ec4700b2c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7022ab2cc6a60f57f061d3b1acaa8a125da6db7e9cec76421189ae2fa2c4729b4134cb8246a8c14283aeb7769e1ef92a75f480e2d6be3a3e41ba2e8017b7ee2c
|
7
|
+
data.tar.gz: fa6e2f2f74cf1c4a6a03e53c2d76c962154e8d64f656d32690101bdd4dfa636dda9036213b29dfc675d07e9ed381a6527e94abb462f4f7eabfa3c3f11f279ee1
|
data/Gemfile
CHANGED
data/beetle_etl.gemspec
CHANGED
@@ -18,12 +18,11 @@ Gem::Specification.new do |spec|
|
|
18
18
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
19
|
spec.require_paths = ['lib']
|
20
20
|
|
21
|
-
spec.add_runtime_dependency 'sequel', '>= 4.
|
21
|
+
spec.add_runtime_dependency 'sequel', '>= 4.0.0'
|
22
22
|
|
23
23
|
spec.add_development_dependency 'bundler', '~> 1.6'
|
24
|
-
spec.add_development_dependency '
|
25
|
-
spec.add_development_dependency '
|
26
|
-
spec.add_development_dependency 'pg'
|
27
|
-
spec.add_development_dependency '
|
28
|
-
spec.add_development_dependency 'activesupport'
|
24
|
+
spec.add_development_dependency 'rspec', '>= 3.0.0'
|
25
|
+
spec.add_development_dependency 'timecop', '>= 0.7.0'
|
26
|
+
spec.add_development_dependency 'pg', '>= 0.18.0'
|
27
|
+
spec.add_development_dependency 'activesupport', '>= 4.2.0'
|
29
28
|
end
|
data/lib/beetle_etl.rb
CHANGED
@@ -10,17 +10,20 @@ module BeetleETL
|
|
10
10
|
require 'beetle_etl/dsl/transformation'
|
11
11
|
require 'beetle_etl/dsl/transformation_loader'
|
12
12
|
|
13
|
+
require 'beetle_etl/naming'
|
14
|
+
|
13
15
|
require 'beetle_etl/steps/step'
|
16
|
+
require 'beetle_etl/steps/create_stage'
|
14
17
|
require 'beetle_etl/steps/transform'
|
15
18
|
require 'beetle_etl/steps/map_relations'
|
16
19
|
require 'beetle_etl/steps/table_diff'
|
17
20
|
require 'beetle_etl/steps/assign_ids'
|
18
21
|
require 'beetle_etl/steps/load'
|
22
|
+
require 'beetle_etl/steps/drop_stage'
|
19
23
|
|
20
24
|
require 'beetle_etl/task_runner/dependency_resolver'
|
21
25
|
require 'beetle_etl/task_runner/task_runner'
|
22
26
|
|
23
|
-
require 'beetle_etl/state'
|
24
27
|
require 'beetle_etl/import'
|
25
28
|
|
26
29
|
class Configuration
|
@@ -29,23 +32,20 @@ module BeetleETL
|
|
29
32
|
:database,
|
30
33
|
:transformation_file,
|
31
34
|
:stage_schema,
|
35
|
+
:public_schema,
|
32
36
|
:external_source
|
33
37
|
|
34
38
|
def initialize
|
35
|
-
@
|
39
|
+
@public_schema = 'public'
|
36
40
|
end
|
37
41
|
end
|
38
42
|
|
39
43
|
class << self
|
40
44
|
|
41
45
|
def import
|
42
|
-
state.start_import
|
43
|
-
|
44
46
|
begin
|
45
|
-
Import.run
|
46
|
-
state.mark_as_succeeded
|
47
|
+
Import.new.run
|
47
48
|
rescue Exception => e
|
48
|
-
state.mark_as_failed
|
49
49
|
raise e
|
50
50
|
ensure
|
51
51
|
@database.disconnect if @database
|
@@ -71,13 +71,8 @@ module BeetleETL
|
|
71
71
|
end
|
72
72
|
end
|
73
73
|
|
74
|
-
def state
|
75
|
-
@state ||= State.new
|
76
|
-
end
|
77
|
-
|
78
74
|
def reset
|
79
75
|
@config = nil
|
80
|
-
@state = nil
|
81
76
|
@database = nil
|
82
77
|
end
|
83
78
|
|
data/lib/beetle_etl/dsl/dsl.rb
CHANGED
@@ -1,11 +1,17 @@
|
|
1
1
|
module BeetleETL
|
2
2
|
class DSL
|
3
3
|
|
4
|
-
attr_reader :relations, :
|
4
|
+
attr_reader :column_names, :relations, :query_strings
|
5
5
|
|
6
6
|
def initialize(table_name)
|
7
7
|
@table_name = table_name
|
8
|
+
@column_names = []
|
8
9
|
@relations = {}
|
10
|
+
@query_strings = []
|
11
|
+
end
|
12
|
+
|
13
|
+
def columns(*column_names)
|
14
|
+
@column_names = column_names
|
9
15
|
end
|
10
16
|
|
11
17
|
def references(foreign_table, on: foreign_key)
|
@@ -13,25 +19,18 @@ module BeetleETL
|
|
13
19
|
end
|
14
20
|
|
15
21
|
def query(query)
|
16
|
-
@
|
22
|
+
@query_strings << query
|
17
23
|
end
|
18
24
|
|
25
|
+
# query helper methods
|
19
26
|
|
20
|
-
def stage_table
|
21
|
-
|
22
|
-
end
|
23
|
-
|
24
|
-
def external_source
|
25
|
-
'source'
|
27
|
+
def stage_table(table_name = nil)
|
28
|
+
BeetleETL::Naming.stage_table_name_sql(table_name || @table_name)
|
26
29
|
end
|
27
30
|
|
28
31
|
def combined_key(*args)
|
29
32
|
%Q('[' || #{args.join(%q[ || ',' || ])} || ']')
|
30
33
|
end
|
31
34
|
|
32
|
-
def import_run_id
|
33
|
-
BeetleETL.state.run_id
|
34
|
-
end
|
35
|
-
|
36
35
|
end
|
37
36
|
end
|
@@ -5,9 +5,16 @@ module BeetleETL
|
|
5
5
|
|
6
6
|
attr_reader :table_name
|
7
7
|
|
8
|
-
def initialize(table_name, setup)
|
8
|
+
def initialize(table_name, setup, helpers = nil)
|
9
9
|
@table_name = table_name
|
10
|
-
|
10
|
+
@parsed = DSL.new(table_name).tap do |dsl|
|
11
|
+
dsl.instance_eval(&helpers) if helpers
|
12
|
+
dsl.instance_eval(&setup)
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
def column_names
|
17
|
+
@parsed.column_names.map(&:to_sym)
|
11
18
|
end
|
12
19
|
|
13
20
|
def relations
|
@@ -19,7 +26,7 @@ module BeetleETL
|
|
19
26
|
end
|
20
27
|
|
21
28
|
def query
|
22
|
-
@parsed.
|
29
|
+
@parsed.query_strings.join(';')
|
23
30
|
end
|
24
31
|
|
25
32
|
end
|
@@ -1,21 +1,29 @@
|
|
1
1
|
module BeetleETL
|
2
|
-
|
3
|
-
extend self
|
2
|
+
class TransformationLoader
|
4
3
|
|
5
|
-
def
|
4
|
+
def initialize
|
6
5
|
@transformations = []
|
6
|
+
@helper_definitions = nil
|
7
|
+
end
|
7
8
|
|
9
|
+
def load
|
8
10
|
File.open(BeetleETL.config.transformation_file, 'r') do |file|
|
9
11
|
instance_eval file.read
|
10
12
|
end
|
11
13
|
|
12
|
-
@transformations
|
14
|
+
@transformations.map do |(table_name, setup)|
|
15
|
+
Transformation.new(table_name, setup, @helper_definitions)
|
16
|
+
end
|
13
17
|
end
|
14
18
|
|
15
19
|
private
|
16
20
|
|
17
21
|
def import(table_name, &setup)
|
18
|
-
@transformations <<
|
22
|
+
@transformations << [table_name, setup]
|
23
|
+
end
|
24
|
+
|
25
|
+
def helpers(&helper_definitions)
|
26
|
+
@helper_definitions = helper_definitions
|
19
27
|
end
|
20
28
|
|
21
29
|
end
|
data/lib/beetle_etl/import.rb
CHANGED
@@ -1,13 +1,15 @@
|
|
1
1
|
module BeetleETL
|
2
|
-
|
3
|
-
|
4
|
-
extend self
|
2
|
+
class Import
|
5
3
|
|
6
4
|
def run
|
7
5
|
TaskRunner.new(data_steps).run
|
8
6
|
BeetleETL.database.transaction do
|
9
7
|
TaskRunner.new(load_steps).run
|
10
8
|
end
|
9
|
+
rescue => e
|
10
|
+
raise e
|
11
|
+
ensure
|
12
|
+
TaskRunner.new(cleanup_steps).run
|
11
13
|
end
|
12
14
|
|
13
15
|
private
|
@@ -15,6 +17,7 @@ module BeetleETL
|
|
15
17
|
def data_steps
|
16
18
|
transformations.flat_map do |t|
|
17
19
|
[
|
20
|
+
CreateStage.new(t.table_name, t.relations, t.column_names),
|
18
21
|
Transform.new(t.table_name, t.dependencies, t.query),
|
19
22
|
MapRelations.new(t.table_name, t.relations),
|
20
23
|
TableDiff.new(t.table_name),
|
@@ -29,8 +32,12 @@ module BeetleETL
|
|
29
32
|
end
|
30
33
|
end
|
31
34
|
|
35
|
+
def cleanup_steps
|
36
|
+
transformations.map { |t| DropStage.new(t.table_name) }
|
37
|
+
end
|
38
|
+
|
32
39
|
def transformations
|
33
|
-
@transformations ||= TransformationLoader.load
|
40
|
+
@transformations ||= TransformationLoader.new.load
|
34
41
|
end
|
35
42
|
|
36
43
|
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'digest'
|
2
|
+
|
3
|
+
module BeetleETL
|
4
|
+
module Naming
|
5
|
+
|
6
|
+
extend self
|
7
|
+
|
8
|
+
def stage_table_name(table_name = nil)
|
9
|
+
name = (table_name || @table_name).to_s
|
10
|
+
digest = Digest::MD5.hexdigest(name)
|
11
|
+
"#{name}-#{digest}"[0, 63]
|
12
|
+
end
|
13
|
+
|
14
|
+
def stage_table_name_sql(table_name = nil)
|
15
|
+
%Q("#{stage_table_name(table_name)}")
|
16
|
+
end
|
17
|
+
|
18
|
+
def public_table_name(table_name = nil)
|
19
|
+
name = (table_name || @table_name).to_s
|
20
|
+
[public_schema, name].compact.join('.')
|
21
|
+
end
|
22
|
+
|
23
|
+
def public_table_name_sql(table_name = nil)
|
24
|
+
name = (table_name || @table_name).to_s
|
25
|
+
public_table_name= [public_schema, name].compact.join('"."')
|
26
|
+
%Q("#{public_table_name}")
|
27
|
+
end
|
28
|
+
|
29
|
+
private
|
30
|
+
|
31
|
+
def public_schema
|
32
|
+
public_schema = BeetleETL.config.public_schema
|
33
|
+
public_schema != 'public' ? public_schema : nil
|
34
|
+
end
|
35
|
+
|
36
|
+
end
|
37
|
+
end
|
@@ -6,50 +6,26 @@ module BeetleETL
|
|
6
6
|
end
|
7
7
|
|
8
8
|
def run
|
9
|
-
|
10
|
-
|
11
|
-
Thread.new { map_existing_ids }
|
12
|
-
].each(&:join)
|
9
|
+
assign_new_ids
|
10
|
+
map_existing_ids
|
13
11
|
end
|
14
12
|
|
15
13
|
def assign_new_ids
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
.update(
|
22
|
-
id: Sequel.function(:NEXTVAL, "public.#{table_name}_id_seq")
|
23
|
-
)
|
14
|
+
database.execute <<-SQL
|
15
|
+
UPDATE #{stage_table_name_sql}
|
16
|
+
SET id = nextval('#{table_name}_id_seq')
|
17
|
+
WHERE transition = 'CREATE'
|
18
|
+
SQL
|
24
19
|
end
|
25
20
|
|
26
21
|
def map_existing_ids
|
27
|
-
|
28
|
-
|
29
|
-
.
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
.update(id: :public__id)
|
35
|
-
end
|
36
|
-
|
37
|
-
private
|
38
|
-
|
39
|
-
def stage_table_identifier
|
40
|
-
:"#{stage_schema}__#{table_name}___stage"
|
41
|
-
end
|
42
|
-
|
43
|
-
def stage_table
|
44
|
-
database[stage_table_identifier]
|
45
|
-
end
|
46
|
-
|
47
|
-
def public_table_identifier
|
48
|
-
:"#{table_name}___public"
|
49
|
-
end
|
50
|
-
|
51
|
-
def public_table
|
52
|
-
database[public_table_identifier]
|
22
|
+
database.execute <<-SQL
|
23
|
+
UPDATE #{stage_table_name_sql} stage
|
24
|
+
SET id = public.id
|
25
|
+
FROM #{public_table_name_sql} public
|
26
|
+
WHERE stage.transition IN ('KEEP', 'UPDATE', 'DELETE', 'UNDELETE')
|
27
|
+
AND stage.external_id = public.external_id
|
28
|
+
SQL
|
53
29
|
end
|
54
30
|
|
55
31
|
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
module BeetleETL
|
2
|
+
class CreateStage < Step
|
3
|
+
|
4
|
+
def initialize(table_name, relations, column_names)
|
5
|
+
super(table_name)
|
6
|
+
@relations = relations
|
7
|
+
@column_names = column_names
|
8
|
+
end
|
9
|
+
|
10
|
+
def dependencies
|
11
|
+
Set.new
|
12
|
+
end
|
13
|
+
|
14
|
+
def run
|
15
|
+
database.execute <<-SQL
|
16
|
+
CREATE TABLE #{stage_table_name_sql} (
|
17
|
+
id integer,
|
18
|
+
external_id character varying(255),
|
19
|
+
transition character varying(255),
|
20
|
+
|
21
|
+
#{[
|
22
|
+
payload_column_definitions,
|
23
|
+
relation_column_definitions
|
24
|
+
].compact.join(',')}
|
25
|
+
)
|
26
|
+
SQL
|
27
|
+
end
|
28
|
+
|
29
|
+
private
|
30
|
+
|
31
|
+
def payload_column_definitions
|
32
|
+
definitions = (@column_names - @relations.keys).map do |column_name|
|
33
|
+
"#{column_name} #{column_type(column_name)}"
|
34
|
+
end
|
35
|
+
definitions.join(',') if definitions.any?
|
36
|
+
end
|
37
|
+
|
38
|
+
def relation_column_definitions
|
39
|
+
definitions = @relations.map do |foreign_key_name, table|
|
40
|
+
<<-SQL
|
41
|
+
#{foreign_key_name} integer,
|
42
|
+
external_#{foreign_key_name} character varying(255)
|
43
|
+
SQL
|
44
|
+
end
|
45
|
+
definitions.join(',') if definitions.any?
|
46
|
+
end
|
47
|
+
|
48
|
+
def column_type(column_name)
|
49
|
+
@column_types ||= Hash[database.schema(public_table_name.to_sym)]
|
50
|
+
.reduce({}) do |acc, (name, schema)|
|
51
|
+
acc[name.to_sym] = schema.fetch(:db_type)
|
52
|
+
acc
|
53
|
+
end
|
54
|
+
|
55
|
+
@column_types[column_name]
|
56
|
+
end
|
57
|
+
|
58
|
+
end
|
59
|
+
end
|
@@ -2,93 +2,78 @@ module BeetleETL
|
|
2
2
|
class Load < Step
|
3
3
|
|
4
4
|
IMPORTER_COLUMNS = %i[
|
5
|
-
import_run_id
|
6
5
|
external_source
|
7
6
|
transition
|
8
7
|
]
|
9
8
|
|
10
|
-
attr_reader :relations
|
11
|
-
|
12
9
|
def initialize(table_name, relations)
|
13
10
|
super(table_name)
|
14
11
|
@relations = relations
|
15
12
|
end
|
16
13
|
|
17
14
|
def run
|
18
|
-
%w(create update delete undelete).
|
19
|
-
|
20
|
-
end
|
15
|
+
%w(create update delete undelete).each do |transition|
|
16
|
+
public_send(:"load_#{transition}")
|
17
|
+
end
|
21
18
|
end
|
22
19
|
|
23
20
|
def dependencies
|
24
|
-
relations.values.map { |d| Load.step_name(d) }.to_set
|
21
|
+
@relations.values.map { |d| Load.step_name(d) }.to_set
|
25
22
|
end
|
26
23
|
|
27
24
|
def load_create
|
28
25
|
just_now = now
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
.
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
26
|
+
|
27
|
+
database.execute <<-SQL
|
28
|
+
INSERT INTO #{public_table_name_sql}
|
29
|
+
(#{data_columns.join(', ')}, external_source, created_at, updated_at)
|
30
|
+
SELECT
|
31
|
+
#{data_columns.join(', ')},
|
32
|
+
'#{external_source}',
|
33
|
+
'#{just_now}',
|
34
|
+
'#{just_now}'
|
35
|
+
FROM #{stage_table_name_sql}
|
36
|
+
WHERE transition = 'CREATE'
|
37
|
+
SQL
|
39
38
|
end
|
40
39
|
|
41
40
|
def load_update
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
.where(
|
52
|
-
stage__id: :public__id,
|
53
|
-
stage__transition: 'UPDATE',
|
54
|
-
stage__import_run_id: run_id,
|
55
|
-
)
|
56
|
-
.update(updates)
|
41
|
+
database.execute <<-SQL
|
42
|
+
UPDATE #{public_table_name_sql} public
|
43
|
+
SET
|
44
|
+
#{updatable_columns.map { |c| %Q("#{c}" = stage."#{c}") }.join(',')},
|
45
|
+
"updated_at" = '#{now}'
|
46
|
+
FROM #{stage_table_name_sql} stage
|
47
|
+
WHERE stage.id = public.id
|
48
|
+
AND stage.transition = 'UPDATE'
|
49
|
+
SQL
|
57
50
|
end
|
58
51
|
|
59
52
|
def load_delete
|
60
53
|
just_now = now
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
updated_at: just_now,
|
72
|
-
deleted_at: just_now,
|
73
|
-
)
|
54
|
+
|
55
|
+
database.execute <<-SQL
|
56
|
+
UPDATE #{public_table_name_sql} public
|
57
|
+
SET
|
58
|
+
updated_at = '#{just_now}',
|
59
|
+
deleted_at = '#{just_now}'
|
60
|
+
FROM #{stage_table_name_sql} stage
|
61
|
+
WHERE stage.id = public.id
|
62
|
+
AND stage.transition = 'DELETE'
|
63
|
+
SQL
|
74
64
|
end
|
75
65
|
|
76
66
|
def load_undelete
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
stage__id: :public__id,
|
88
|
-
stage__transition: 'UNDELETE',
|
89
|
-
stage__import_run_id: run_id,
|
90
|
-
)
|
91
|
-
.update(updates)
|
67
|
+
database.execute <<-SQL
|
68
|
+
UPDATE #{public_table_name_sql} public
|
69
|
+
SET
|
70
|
+
#{updatable_columns.map { |c| %Q("#{c}" = stage."#{c}") }.join(',')},
|
71
|
+
updated_at = '#{now}',
|
72
|
+
deleted_at = NULL
|
73
|
+
FROM #{stage_table_name_sql} stage
|
74
|
+
WHERE stage.id = public.id
|
75
|
+
AND stage.transition = 'UNDELETE'
|
76
|
+
SQL
|
92
77
|
end
|
93
78
|
|
94
79
|
private
|
@@ -98,7 +83,7 @@ module BeetleETL
|
|
98
83
|
end
|
99
84
|
|
100
85
|
def table_columns
|
101
|
-
@table_columns ||= database[
|
86
|
+
@table_columns ||= database[stage_table_name.to_sym].columns
|
102
87
|
end
|
103
88
|
|
104
89
|
def ignored_columns
|