beetle_etl 0.0.2 → 0.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +6 -0
- data/beetle_etl.gemspec +5 -6
- data/lib/beetle_etl.rb +7 -12
- data/lib/beetle_etl/dsl/dsl.rb +11 -12
- data/lib/beetle_etl/dsl/transformation.rb +10 -3
- data/lib/beetle_etl/dsl/transformation_loader.rb +13 -5
- data/lib/beetle_etl/import.rb +11 -4
- data/lib/beetle_etl/naming.rb +37 -0
- data/lib/beetle_etl/steps/assign_ids.rb +14 -38
- data/lib/beetle_etl/steps/create_stage.rb +59 -0
- data/lib/beetle_etl/steps/drop_stage.rb +15 -0
- data/lib/beetle_etl/steps/load.rb +46 -61
- data/lib/beetle_etl/steps/map_relations.rb +8 -14
- data/lib/beetle_etl/steps/step.rb +1 -8
- data/lib/beetle_etl/steps/table_diff.rb +68 -89
- data/lib/beetle_etl/steps/transform.rb +2 -4
- data/lib/beetle_etl/version.rb +1 -1
- data/spec/beetle_etl_spec.rb +3 -25
- data/spec/dsl/dsl_spec.rb +8 -15
- data/spec/dsl/transformation_loader_spec.rb +11 -4
- data/spec/dsl/transformation_spec.rb +40 -4
- data/spec/feature/example_schema.rb +2 -137
- data/spec/feature/example_transform.rb +13 -6
- data/spec/feature/feature_spec.rb +119 -18
- data/spec/steps/assign_ids_spec.rb +23 -28
- data/spec/steps/create_stage_spec.rb +89 -0
- data/spec/steps/load_spec.rb +15 -23
- data/spec/steps/map_relations_spec.rb +32 -36
- data/spec/steps/table_diff_spec.rb +41 -45
- data/spec/steps/transform_spec.rb +2 -0
- data/spec/{dependency_resolver_spec.rb → task_runner/dependency_resolver_spec.rb} +0 -0
- metadata +22 -36
- data/lib/beetle_etl/state.rb +0 -67
- data/spec/import_spec.rb +0 -7
- data/spec/state_spec.rb +0 -124
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9d792ed373d43d6c0ab7ec27241eaf18fb59736b
|
4
|
+
data.tar.gz: 7b08688ce87fbff7eea72b362773444ec4700b2c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7022ab2cc6a60f57f061d3b1acaa8a125da6db7e9cec76421189ae2fa2c4729b4134cb8246a8c14283aeb7769e1ef92a75f480e2d6be3a3e41ba2e8017b7ee2c
|
7
|
+
data.tar.gz: fa6e2f2f74cf1c4a6a03e53c2d76c962154e8d64f656d32690101bdd4dfa636dda9036213b29dfc675d07e9ed381a6527e94abb462f4f7eabfa3c3f11f279ee1
|
data/Gemfile
CHANGED
data/beetle_etl.gemspec
CHANGED
@@ -18,12 +18,11 @@ Gem::Specification.new do |spec|
|
|
18
18
|
spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
|
19
19
|
spec.require_paths = ['lib']
|
20
20
|
|
21
|
-
spec.add_runtime_dependency 'sequel', '>= 4.
|
21
|
+
spec.add_runtime_dependency 'sequel', '>= 4.0.0'
|
22
22
|
|
23
23
|
spec.add_development_dependency 'bundler', '~> 1.6'
|
24
|
-
spec.add_development_dependency '
|
25
|
-
spec.add_development_dependency '
|
26
|
-
spec.add_development_dependency 'pg'
|
27
|
-
spec.add_development_dependency '
|
28
|
-
spec.add_development_dependency 'activesupport'
|
24
|
+
spec.add_development_dependency 'rspec', '>= 3.0.0'
|
25
|
+
spec.add_development_dependency 'timecop', '>= 0.7.0'
|
26
|
+
spec.add_development_dependency 'pg', '>= 0.18.0'
|
27
|
+
spec.add_development_dependency 'activesupport', '>= 4.2.0'
|
29
28
|
end
|
data/lib/beetle_etl.rb
CHANGED
@@ -10,17 +10,20 @@ module BeetleETL
|
|
10
10
|
require 'beetle_etl/dsl/transformation'
|
11
11
|
require 'beetle_etl/dsl/transformation_loader'
|
12
12
|
|
13
|
+
require 'beetle_etl/naming'
|
14
|
+
|
13
15
|
require 'beetle_etl/steps/step'
|
16
|
+
require 'beetle_etl/steps/create_stage'
|
14
17
|
require 'beetle_etl/steps/transform'
|
15
18
|
require 'beetle_etl/steps/map_relations'
|
16
19
|
require 'beetle_etl/steps/table_diff'
|
17
20
|
require 'beetle_etl/steps/assign_ids'
|
18
21
|
require 'beetle_etl/steps/load'
|
22
|
+
require 'beetle_etl/steps/drop_stage'
|
19
23
|
|
20
24
|
require 'beetle_etl/task_runner/dependency_resolver'
|
21
25
|
require 'beetle_etl/task_runner/task_runner'
|
22
26
|
|
23
|
-
require 'beetle_etl/state'
|
24
27
|
require 'beetle_etl/import'
|
25
28
|
|
26
29
|
class Configuration
|
@@ -29,23 +32,20 @@ module BeetleETL
|
|
29
32
|
:database,
|
30
33
|
:transformation_file,
|
31
34
|
:stage_schema,
|
35
|
+
:public_schema,
|
32
36
|
:external_source
|
33
37
|
|
34
38
|
def initialize
|
35
|
-
@
|
39
|
+
@public_schema = 'public'
|
36
40
|
end
|
37
41
|
end
|
38
42
|
|
39
43
|
class << self
|
40
44
|
|
41
45
|
def import
|
42
|
-
state.start_import
|
43
|
-
|
44
46
|
begin
|
45
|
-
Import.run
|
46
|
-
state.mark_as_succeeded
|
47
|
+
Import.new.run
|
47
48
|
rescue Exception => e
|
48
|
-
state.mark_as_failed
|
49
49
|
raise e
|
50
50
|
ensure
|
51
51
|
@database.disconnect if @database
|
@@ -71,13 +71,8 @@ module BeetleETL
|
|
71
71
|
end
|
72
72
|
end
|
73
73
|
|
74
|
-
def state
|
75
|
-
@state ||= State.new
|
76
|
-
end
|
77
|
-
|
78
74
|
def reset
|
79
75
|
@config = nil
|
80
|
-
@state = nil
|
81
76
|
@database = nil
|
82
77
|
end
|
83
78
|
|
data/lib/beetle_etl/dsl/dsl.rb
CHANGED
@@ -1,11 +1,17 @@
|
|
1
1
|
module BeetleETL
|
2
2
|
class DSL
|
3
3
|
|
4
|
-
attr_reader :relations, :
|
4
|
+
attr_reader :column_names, :relations, :query_strings
|
5
5
|
|
6
6
|
def initialize(table_name)
|
7
7
|
@table_name = table_name
|
8
|
+
@column_names = []
|
8
9
|
@relations = {}
|
10
|
+
@query_strings = []
|
11
|
+
end
|
12
|
+
|
13
|
+
def columns(*column_names)
|
14
|
+
@column_names = column_names
|
9
15
|
end
|
10
16
|
|
11
17
|
def references(foreign_table, on: foreign_key)
|
@@ -13,25 +19,18 @@ module BeetleETL
|
|
13
19
|
end
|
14
20
|
|
15
21
|
def query(query)
|
16
|
-
@
|
22
|
+
@query_strings << query
|
17
23
|
end
|
18
24
|
|
25
|
+
# query helper methods
|
19
26
|
|
20
|
-
def stage_table
|
21
|
-
|
22
|
-
end
|
23
|
-
|
24
|
-
def external_source
|
25
|
-
'source'
|
27
|
+
def stage_table(table_name = nil)
|
28
|
+
BeetleETL::Naming.stage_table_name_sql(table_name || @table_name)
|
26
29
|
end
|
27
30
|
|
28
31
|
def combined_key(*args)
|
29
32
|
%Q('[' || #{args.join(%q[ || ',' || ])} || ']')
|
30
33
|
end
|
31
34
|
|
32
|
-
def import_run_id
|
33
|
-
BeetleETL.state.run_id
|
34
|
-
end
|
35
|
-
|
36
35
|
end
|
37
36
|
end
|
@@ -5,9 +5,16 @@ module BeetleETL
|
|
5
5
|
|
6
6
|
attr_reader :table_name
|
7
7
|
|
8
|
-
def initialize(table_name, setup)
|
8
|
+
def initialize(table_name, setup, helpers = nil)
|
9
9
|
@table_name = table_name
|
10
|
-
|
10
|
+
@parsed = DSL.new(table_name).tap do |dsl|
|
11
|
+
dsl.instance_eval(&helpers) if helpers
|
12
|
+
dsl.instance_eval(&setup)
|
13
|
+
end
|
14
|
+
end
|
15
|
+
|
16
|
+
def column_names
|
17
|
+
@parsed.column_names.map(&:to_sym)
|
11
18
|
end
|
12
19
|
|
13
20
|
def relations
|
@@ -19,7 +26,7 @@ module BeetleETL
|
|
19
26
|
end
|
20
27
|
|
21
28
|
def query
|
22
|
-
@parsed.
|
29
|
+
@parsed.query_strings.join(';')
|
23
30
|
end
|
24
31
|
|
25
32
|
end
|
@@ -1,21 +1,29 @@
|
|
1
1
|
module BeetleETL
|
2
|
-
|
3
|
-
extend self
|
2
|
+
class TransformationLoader
|
4
3
|
|
5
|
-
def
|
4
|
+
def initialize
|
6
5
|
@transformations = []
|
6
|
+
@helper_definitions = nil
|
7
|
+
end
|
7
8
|
|
9
|
+
def load
|
8
10
|
File.open(BeetleETL.config.transformation_file, 'r') do |file|
|
9
11
|
instance_eval file.read
|
10
12
|
end
|
11
13
|
|
12
|
-
@transformations
|
14
|
+
@transformations.map do |(table_name, setup)|
|
15
|
+
Transformation.new(table_name, setup, @helper_definitions)
|
16
|
+
end
|
13
17
|
end
|
14
18
|
|
15
19
|
private
|
16
20
|
|
17
21
|
def import(table_name, &setup)
|
18
|
-
@transformations <<
|
22
|
+
@transformations << [table_name, setup]
|
23
|
+
end
|
24
|
+
|
25
|
+
def helpers(&helper_definitions)
|
26
|
+
@helper_definitions = helper_definitions
|
19
27
|
end
|
20
28
|
|
21
29
|
end
|
data/lib/beetle_etl/import.rb
CHANGED
@@ -1,13 +1,15 @@
|
|
1
1
|
module BeetleETL
|
2
|
-
|
3
|
-
|
4
|
-
extend self
|
2
|
+
class Import
|
5
3
|
|
6
4
|
def run
|
7
5
|
TaskRunner.new(data_steps).run
|
8
6
|
BeetleETL.database.transaction do
|
9
7
|
TaskRunner.new(load_steps).run
|
10
8
|
end
|
9
|
+
rescue => e
|
10
|
+
raise e
|
11
|
+
ensure
|
12
|
+
TaskRunner.new(cleanup_steps).run
|
11
13
|
end
|
12
14
|
|
13
15
|
private
|
@@ -15,6 +17,7 @@ module BeetleETL
|
|
15
17
|
def data_steps
|
16
18
|
transformations.flat_map do |t|
|
17
19
|
[
|
20
|
+
CreateStage.new(t.table_name, t.relations, t.column_names),
|
18
21
|
Transform.new(t.table_name, t.dependencies, t.query),
|
19
22
|
MapRelations.new(t.table_name, t.relations),
|
20
23
|
TableDiff.new(t.table_name),
|
@@ -29,8 +32,12 @@ module BeetleETL
|
|
29
32
|
end
|
30
33
|
end
|
31
34
|
|
35
|
+
def cleanup_steps
|
36
|
+
transformations.map { |t| DropStage.new(t.table_name) }
|
37
|
+
end
|
38
|
+
|
32
39
|
def transformations
|
33
|
-
@transformations ||= TransformationLoader.load
|
40
|
+
@transformations ||= TransformationLoader.new.load
|
34
41
|
end
|
35
42
|
|
36
43
|
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'digest'
|
2
|
+
|
3
|
+
module BeetleETL
|
4
|
+
module Naming
|
5
|
+
|
6
|
+
extend self
|
7
|
+
|
8
|
+
def stage_table_name(table_name = nil)
|
9
|
+
name = (table_name || @table_name).to_s
|
10
|
+
digest = Digest::MD5.hexdigest(name)
|
11
|
+
"#{name}-#{digest}"[0, 63]
|
12
|
+
end
|
13
|
+
|
14
|
+
def stage_table_name_sql(table_name = nil)
|
15
|
+
%Q("#{stage_table_name(table_name)}")
|
16
|
+
end
|
17
|
+
|
18
|
+
def public_table_name(table_name = nil)
|
19
|
+
name = (table_name || @table_name).to_s
|
20
|
+
[public_schema, name].compact.join('.')
|
21
|
+
end
|
22
|
+
|
23
|
+
def public_table_name_sql(table_name = nil)
|
24
|
+
name = (table_name || @table_name).to_s
|
25
|
+
public_table_name= [public_schema, name].compact.join('"."')
|
26
|
+
%Q("#{public_table_name}")
|
27
|
+
end
|
28
|
+
|
29
|
+
private
|
30
|
+
|
31
|
+
def public_schema
|
32
|
+
public_schema = BeetleETL.config.public_schema
|
33
|
+
public_schema != 'public' ? public_schema : nil
|
34
|
+
end
|
35
|
+
|
36
|
+
end
|
37
|
+
end
|
@@ -6,50 +6,26 @@ module BeetleETL
|
|
6
6
|
end
|
7
7
|
|
8
8
|
def run
|
9
|
-
|
10
|
-
|
11
|
-
Thread.new { map_existing_ids }
|
12
|
-
].each(&:join)
|
9
|
+
assign_new_ids
|
10
|
+
map_existing_ids
|
13
11
|
end
|
14
12
|
|
15
13
|
def assign_new_ids
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
.update(
|
22
|
-
id: Sequel.function(:NEXTVAL, "public.#{table_name}_id_seq")
|
23
|
-
)
|
14
|
+
database.execute <<-SQL
|
15
|
+
UPDATE #{stage_table_name_sql}
|
16
|
+
SET id = nextval('#{table_name}_id_seq')
|
17
|
+
WHERE transition = 'CREATE'
|
18
|
+
SQL
|
24
19
|
end
|
25
20
|
|
26
21
|
def map_existing_ids
|
27
|
-
|
28
|
-
|
29
|
-
.
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
.update(id: :public__id)
|
35
|
-
end
|
36
|
-
|
37
|
-
private
|
38
|
-
|
39
|
-
def stage_table_identifier
|
40
|
-
:"#{stage_schema}__#{table_name}___stage"
|
41
|
-
end
|
42
|
-
|
43
|
-
def stage_table
|
44
|
-
database[stage_table_identifier]
|
45
|
-
end
|
46
|
-
|
47
|
-
def public_table_identifier
|
48
|
-
:"#{table_name}___public"
|
49
|
-
end
|
50
|
-
|
51
|
-
def public_table
|
52
|
-
database[public_table_identifier]
|
22
|
+
database.execute <<-SQL
|
23
|
+
UPDATE #{stage_table_name_sql} stage
|
24
|
+
SET id = public.id
|
25
|
+
FROM #{public_table_name_sql} public
|
26
|
+
WHERE stage.transition IN ('KEEP', 'UPDATE', 'DELETE', 'UNDELETE')
|
27
|
+
AND stage.external_id = public.external_id
|
28
|
+
SQL
|
53
29
|
end
|
54
30
|
|
55
31
|
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
module BeetleETL
|
2
|
+
class CreateStage < Step
|
3
|
+
|
4
|
+
def initialize(table_name, relations, column_names)
|
5
|
+
super(table_name)
|
6
|
+
@relations = relations
|
7
|
+
@column_names = column_names
|
8
|
+
end
|
9
|
+
|
10
|
+
def dependencies
|
11
|
+
Set.new
|
12
|
+
end
|
13
|
+
|
14
|
+
def run
|
15
|
+
database.execute <<-SQL
|
16
|
+
CREATE TABLE #{stage_table_name_sql} (
|
17
|
+
id integer,
|
18
|
+
external_id character varying(255),
|
19
|
+
transition character varying(255),
|
20
|
+
|
21
|
+
#{[
|
22
|
+
payload_column_definitions,
|
23
|
+
relation_column_definitions
|
24
|
+
].compact.join(',')}
|
25
|
+
)
|
26
|
+
SQL
|
27
|
+
end
|
28
|
+
|
29
|
+
private
|
30
|
+
|
31
|
+
def payload_column_definitions
|
32
|
+
definitions = (@column_names - @relations.keys).map do |column_name|
|
33
|
+
"#{column_name} #{column_type(column_name)}"
|
34
|
+
end
|
35
|
+
definitions.join(',') if definitions.any?
|
36
|
+
end
|
37
|
+
|
38
|
+
def relation_column_definitions
|
39
|
+
definitions = @relations.map do |foreign_key_name, table|
|
40
|
+
<<-SQL
|
41
|
+
#{foreign_key_name} integer,
|
42
|
+
external_#{foreign_key_name} character varying(255)
|
43
|
+
SQL
|
44
|
+
end
|
45
|
+
definitions.join(',') if definitions.any?
|
46
|
+
end
|
47
|
+
|
48
|
+
def column_type(column_name)
|
49
|
+
@column_types ||= Hash[database.schema(public_table_name.to_sym)]
|
50
|
+
.reduce({}) do |acc, (name, schema)|
|
51
|
+
acc[name.to_sym] = schema.fetch(:db_type)
|
52
|
+
acc
|
53
|
+
end
|
54
|
+
|
55
|
+
@column_types[column_name]
|
56
|
+
end
|
57
|
+
|
58
|
+
end
|
59
|
+
end
|
@@ -2,93 +2,78 @@ module BeetleETL
|
|
2
2
|
class Load < Step
|
3
3
|
|
4
4
|
IMPORTER_COLUMNS = %i[
|
5
|
-
import_run_id
|
6
5
|
external_source
|
7
6
|
transition
|
8
7
|
]
|
9
8
|
|
10
|
-
attr_reader :relations
|
11
|
-
|
12
9
|
def initialize(table_name, relations)
|
13
10
|
super(table_name)
|
14
11
|
@relations = relations
|
15
12
|
end
|
16
13
|
|
17
14
|
def run
|
18
|
-
%w(create update delete undelete).
|
19
|
-
|
20
|
-
end
|
15
|
+
%w(create update delete undelete).each do |transition|
|
16
|
+
public_send(:"load_#{transition}")
|
17
|
+
end
|
21
18
|
end
|
22
19
|
|
23
20
|
def dependencies
|
24
|
-
relations.values.map { |d| Load.step_name(d) }.to_set
|
21
|
+
@relations.values.map { |d| Load.step_name(d) }.to_set
|
25
22
|
end
|
26
23
|
|
27
24
|
def load_create
|
28
25
|
just_now = now
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
.
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
26
|
+
|
27
|
+
database.execute <<-SQL
|
28
|
+
INSERT INTO #{public_table_name_sql}
|
29
|
+
(#{data_columns.join(', ')}, external_source, created_at, updated_at)
|
30
|
+
SELECT
|
31
|
+
#{data_columns.join(', ')},
|
32
|
+
'#{external_source}',
|
33
|
+
'#{just_now}',
|
34
|
+
'#{just_now}'
|
35
|
+
FROM #{stage_table_name_sql}
|
36
|
+
WHERE transition = 'CREATE'
|
37
|
+
SQL
|
39
38
|
end
|
40
39
|
|
41
40
|
def load_update
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
.where(
|
52
|
-
stage__id: :public__id,
|
53
|
-
stage__transition: 'UPDATE',
|
54
|
-
stage__import_run_id: run_id,
|
55
|
-
)
|
56
|
-
.update(updates)
|
41
|
+
database.execute <<-SQL
|
42
|
+
UPDATE #{public_table_name_sql} public
|
43
|
+
SET
|
44
|
+
#{updatable_columns.map { |c| %Q("#{c}" = stage."#{c}") }.join(',')},
|
45
|
+
"updated_at" = '#{now}'
|
46
|
+
FROM #{stage_table_name_sql} stage
|
47
|
+
WHERE stage.id = public.id
|
48
|
+
AND stage.transition = 'UPDATE'
|
49
|
+
SQL
|
57
50
|
end
|
58
51
|
|
59
52
|
def load_delete
|
60
53
|
just_now = now
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
updated_at: just_now,
|
72
|
-
deleted_at: just_now,
|
73
|
-
)
|
54
|
+
|
55
|
+
database.execute <<-SQL
|
56
|
+
UPDATE #{public_table_name_sql} public
|
57
|
+
SET
|
58
|
+
updated_at = '#{just_now}',
|
59
|
+
deleted_at = '#{just_now}'
|
60
|
+
FROM #{stage_table_name_sql} stage
|
61
|
+
WHERE stage.id = public.id
|
62
|
+
AND stage.transition = 'DELETE'
|
63
|
+
SQL
|
74
64
|
end
|
75
65
|
|
76
66
|
def load_undelete
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
stage__id: :public__id,
|
88
|
-
stage__transition: 'UNDELETE',
|
89
|
-
stage__import_run_id: run_id,
|
90
|
-
)
|
91
|
-
.update(updates)
|
67
|
+
database.execute <<-SQL
|
68
|
+
UPDATE #{public_table_name_sql} public
|
69
|
+
SET
|
70
|
+
#{updatable_columns.map { |c| %Q("#{c}" = stage."#{c}") }.join(',')},
|
71
|
+
updated_at = '#{now}',
|
72
|
+
deleted_at = NULL
|
73
|
+
FROM #{stage_table_name_sql} stage
|
74
|
+
WHERE stage.id = public.id
|
75
|
+
AND stage.transition = 'UNDELETE'
|
76
|
+
SQL
|
92
77
|
end
|
93
78
|
|
94
79
|
private
|
@@ -98,7 +83,7 @@ module BeetleETL
|
|
98
83
|
end
|
99
84
|
|
100
85
|
def table_columns
|
101
|
-
@table_columns ||= database[
|
86
|
+
@table_columns ||= database[stage_table_name.to_sym].columns
|
102
87
|
end
|
103
88
|
|
104
89
|
def ignored_columns
|