beetle_etl 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +23 -0
- data/.travis.yml +12 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +31 -0
- data/Rakefile +5 -0
- data/beetle_etl.gemspec +30 -0
- data/lib/beetle_etl.rb +85 -0
- data/lib/beetle_etl/dsl/dsl.rb +37 -0
- data/lib/beetle_etl/dsl/transformation.rb +26 -0
- data/lib/beetle_etl/dsl/transformation_loader.rb +22 -0
- data/lib/beetle_etl/import.rb +37 -0
- data/lib/beetle_etl/state.rb +67 -0
- data/lib/beetle_etl/steps/assign_ids.rb +54 -0
- data/lib/beetle_etl/steps/load.rb +108 -0
- data/lib/beetle_etl/steps/map_relations.rb +31 -0
- data/lib/beetle_etl/steps/step.rb +42 -0
- data/lib/beetle_etl/steps/table_diff.rb +155 -0
- data/lib/beetle_etl/steps/transform.rb +22 -0
- data/lib/beetle_etl/task_runner/dependency_resolver.rb +39 -0
- data/lib/beetle_etl/task_runner/task_runner.rb +64 -0
- data/lib/beetle_etl/version.rb +3 -0
- data/script/postgres +12 -0
- data/spec/beetle_etl_spec.rb +70 -0
- data/spec/dependency_resolver_spec.rb +57 -0
- data/spec/dsl/dsl_spec.rb +44 -0
- data/spec/dsl/transformation_loader_spec.rb +51 -0
- data/spec/dsl/transformation_spec.rb +54 -0
- data/spec/feature/example_schema.rb +192 -0
- data/spec/feature/example_transform.rb +37 -0
- data/spec/feature/feature_spec.rb +48 -0
- data/spec/import_spec.rb +7 -0
- data/spec/spec_helper.rb +25 -0
- data/spec/state_spec.rb +124 -0
- data/spec/steps/assign_ids_spec.rb +107 -0
- data/spec/steps/load_spec.rb +148 -0
- data/spec/steps/map_relations_spec.rb +92 -0
- data/spec/steps/step_spec.rb +37 -0
- data/spec/steps/table_diff_spec.rb +183 -0
- data/spec/steps/transform_spec.rb +34 -0
- data/spec/support/database.yml.example +9 -0
- data/spec/support/database.yml.travis +4 -0
- data/spec/support/database_helpers.rb +58 -0
- metadata +220 -0
@@ -0,0 +1,108 @@
|
|
1
|
+
module BeetleETL
  # Step that applies the transitions recorded in the stage table to the
  # public table of the same name, for the current import run.
  #
  # Table symbols such as :"schema__table___alias" rely on Sequel's symbol
  # splitting (schema-qualified table plus alias) -- NOTE(review): confirm the
  # Sequel version in use still enables that feature.
  class Load < Step
    # Stage-only bookkeeping columns that are never copied as record data.
    IMPORTER_COLUMNS = %i[
      import_run_id
      external_source
      transition
    ]

    # Runs the four load phases in order: create, update, delete, undelete.
    def run
      %w(create update delete undelete).each do |phase|
        public_send("load_#{phase}")
      end
    end

    # Inserts every stage row marked 'CREATE' for this run into the public
    # table, adding the configured external source and creation timestamps.
    def load_create
      timestamp = now
      target_columns = data_columns + [:external_source, :created_at, :updated_at]
      created_rows = database[:"#{stage_schema}__#{table_name}"]
        .select(*data_columns)
        .where(import_run_id: run_id, transition: 'CREATE')
        .select_more(external_source, timestamp, timestamp)

      database[table_name].import(target_columns, created_rows)
    end

    # Overwrites the updatable columns of public rows with the values of the
    # matching stage rows marked 'UPDATE', and bumps updated_at.
    def load_update
      assignments = updatable_columns.each_with_object({ updated_at: now }) do |column, values|
        values[column] = :"stage__#{column}"
      end

      staged_rows('UPDATE').update(assignments)
    end

    # Soft-deletes public rows whose stage row is marked 'DELETE' by setting
    # deleted_at (and updated_at) to the current time.
    def load_delete
      timestamp = now

      staged_rows('DELETE').update(updated_at: timestamp, deleted_at: timestamp)
    end

    # Revives public rows whose stage row is marked 'UNDELETE': clears
    # deleted_at and copies the updatable columns from the stage row.
    def load_undelete
      assignments = updatable_columns.each_with_object({ updated_at: now, deleted_at: nil }) do |column, values|
        values[column] = :"stage__#{column}"
      end

      staged_rows('UNDELETE').update(assignments)
    end

    private

    # Dataset over the public table (alias "public") and the stage table
    # (alias "stage"), restricted to rows that share an id, carry the given
    # transition and belong to the current import run.
    def staged_rows(transition)
      database.from(
        :"#{table_name}___public",
        :"#{stage_schema}__#{table_name}___stage"
      ).where(
        stage__id: :public__id,
        stage__transition: transition,
        stage__import_run_id: run_id
      )
    end

    # Stage columns that hold actual record data.
    def data_columns
      table_columns - ignored_columns
    end

    # Column names of the stage table, looked up once per step instance.
    def table_columns
      @table_columns ||= database[:"#{stage_schema}__#{table_name}"].columns
    end

    # Importer bookkeeping columns plus every external_*_id column.
    def ignored_columns
      external_reference_columns = table_columns.select do |column_name|
        column_name.to_s.index(/^external_.+_id$/)
      end

      IMPORTER_COLUMNS + external_reference_columns
    end

    # Data columns that may be overwritten on an existing public row.
    def updatable_columns
      data_columns - [:id, :external_source, :external_id]
    end

    def now
      Time.now
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module BeetleETL
  # Step that fills the foreign-key columns of a stage table from the ids of
  # the referenced stage tables, for the current import run.
  class MapRelations < Step
    # Hash of foreign-key column name => referenced table name.
    attr_reader :relations

    def initialize(table_name, relations)
      super(table_name)
      @relations = relations
    end

    # Depends on the AssignIds step of every referenced table and on the
    # Transform step of this step's own table.
    def dependencies
      foreign_steps = relations.values.map { |foreign_table| AssignIds.step_name(foreign_table) }

      foreign_steps.to_set.add(Transform.step_name(table_name))
    end

    # For each relation, sets the foreign-key column of the stage rows (alias
    # ST) to the id of the referenced stage row (alias FT) whose external_id
    # equals ST's "external_<foreign key column>" value. Both sides are
    # restricted to the current import run.
    def run
      relations.each do |foreign_key_column, foreign_table_name|
        joined_rows = database.from(
          :"#{stage_schema}__#{table_name}___ST",
          :"#{stage_schema}__#{foreign_table_name}___FT"
        )

        matching_rows = joined_rows.where(
          ST__import_run_id: run_id,
          FT__import_run_id: run_id,
          FT__external_id: :"ST__external_#{foreign_key_column}"
        )

        matching_rows.update(:"#{foreign_key_column}" => :"FT__id")
      end
    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
module BeetleETL

  # Raised by Step#dependencies when a subclass does not override it.
  DependenciesNotDefinedError = Class.new(StandardError)

  # Base class for import steps. A step is bound to one table and exposes a
  # name ("<table>: <StepClass>") that other steps use to refer to it.
  class Step
    attr_reader :table_name

    def initialize(table_name)
      @table_name = table_name
    end

    # Builds the name of this step class for the given table, e.g.
    # "users: Transform". Only the last segment of the class name is used.
    #
    # Anonymous classes have no name (Class#name is nil); their to_s
    # representation is used instead so the call does not fail.
    def self.step_name(table_name)
      class_label = (name || to_s).split('::').last
      "#{table_name}: #{class_label}"
    end

    # Name of this step instance, derived from its class and table.
    def name
      self.class.step_name(table_name)
    end

    # Names of the steps that must complete before this one. Subclasses are
    # expected to override this.
    #
    # Raises DependenciesNotDefinedError, naming the offending class.
    def dependencies
      raise DependenciesNotDefinedError, "#{self.class} does not define #dependencies"
    end

    # Identifier of the current import run, read from the global state.
    def run_id
      BeetleETL.state.run_id
    end

    # Name of the stage schema, read from the global configuration.
    def stage_schema
      BeetleETL.config.stage_schema
    end

    # External source identifier, read from the global configuration.
    def external_source
      BeetleETL.config.external_source
    end

    # Database handle shared by all steps.
    def database
      BeetleETL.database
    end
  end
end
|
@@ -0,0 +1,155 @@
|
|
1
|
+
module BeetleETL
  # Step that compares the stage rows of the current import run with the
  # public table and records, in the stage table's transition column, what has
  # to happen to each record: CREATE, KEEP, UPDATE, DELETE or UNDELETE.
  #
  # Table symbols such as :"schema__table___alias" rely on Sequel's symbol
  # splitting (schema-qualified table plus alias) -- NOTE(review): confirm the
  # Sequel version in use still enables that feature.
  class TableDiff < Step
    # Stage-only bookkeeping columns that never take part in the comparison.
    IMPORTER_COLUMNS = %i[
      import_run_id
      external_id
      transition
    ]

    # Runs after the MapRelations step of the same table.
    def dependencies
      [MapRelations.step_name(table_name)].to_set
    end

    # Runs all transition phases in order: create, keep, update, delete,
    # undelete.
    def run
      %w(create keep update delete undelete).each do |phase|
        public_send("transition_#{phase}")
      end
    end

    # Marks stage rows that have no public row with the same external id and
    # external source as 'CREATE'.
    def transition_create
      current_run_rows
        .where(Sequel.~(matching_public_rows.exists))
        .update(transition: 'CREATE')
    end

    # Marks stage rows whose live (not soft-deleted) public counterpart has
    # identical data columns as 'KEEP'.
    def transition_keep
      unchanged = matching_public_rows(public__deleted_at: nil).where(
        ':public_columns IS NOT DISTINCT FROM :stage_columns',
        public_columns: public_record_columns,
        stage_columns: stage_record_columns
      )

      current_run_rows.where(unchanged.exists).update(transition: 'KEEP')
    end

    # Marks stage rows whose live public counterpart differs in at least one
    # data column as 'UPDATE'.
    def transition_update
      changed = matching_public_rows(public__deleted_at: nil).where(
        ':public_columns IS DISTINCT FROM :stage_columns',
        public_columns: public_record_columns,
        stage_columns: stage_record_columns
      )

      current_run_rows.where(changed.exists).update(transition: 'UPDATE')
    end

    # Inserts a 'DELETE' stage row for every live public row of the configured
    # external source that has no stage row with the same external id.
    #
    # The external_source filter is applied in WHERE, not in the join's ON
    # clause: an outer join keeps every public row regardless of ON, so a
    # filter placed there would let rows of other external sources through
    # with NULL stage columns and stage them for deletion.
    #
    # NOTE(review): the join is not restricted to the current import run;
    # stage rows left over from other runs also count as "still present" --
    # confirm the stage table only ever holds one run.
    def transition_delete
      missing_from_stage = database.from(
        :"#{stage_schema}__#{table_name}___stage"
      ).right_join(
        :"#{table_name}___public",
        public__external_id: :stage__external_id
      ).where(
        stage__external_id: nil,
        public__external_source: external_source,
        public__deleted_at: nil
      )

      database[:"#{stage_schema}__#{table_name}"].import(
        [:import_run_id, :external_id, :transition],
        missing_from_stage.select(run_id, :public__external_id, 'DELETE')
      )
    end

    # Marks stage rows whose public counterpart is soft-deleted (deleted_at is
    # set) as 'UNDELETE'.
    def transition_undelete
      soft_deleted = matching_public_rows.exclude(public__deleted_at: nil)

      current_run_rows.where(soft_deleted.exists).update(transition: 'UNDELETE')
    end

    private

    # Stage rows that belong to the current import run.
    def current_run_rows
      stage_table.where(stage__import_run_id: run_id)
    end

    # Public rows with the same external id as the (outer) stage row and the
    # configured external source, optionally narrowed by extra conditions.
    # Intended for use as a correlated EXISTS subquery.
    def matching_public_rows(extra_conditions = {})
      public_table.where({
        public__external_id: :stage__external_id,
        public__external_source: external_source
      }.merge(extra_conditions))
    end

    # Stage table dataset, aliased as "stage".
    def stage_table
      @stage_table ||= database[:"#{stage_schema}__#{table_name}___stage"]
    end

    # Public table dataset, aliased as "public".
    def public_table
      @public_table ||= database[:"#{table_name}___public"]
    end

    def public_record_columns
      prefixed_columns(data_columns, 'public')
    end

    def stage_record_columns
      prefixed_columns(data_columns, 'stage')
    end

    # Stage columns that hold actual record data and are compared.
    def data_columns
      table_columns - ignored_columns
    end

    # Column names of the stage table, looked up once per step instance.
    def table_columns
      @table_columns ||= database[:"#{stage_schema}__#{table_name}"].columns
    end

    # Importer bookkeeping columns, the id, and every external_*_id column.
    def ignored_columns
      external_reference_columns = table_columns.select do |column_name|
        column_name.to_s.index(/^external_.+_id$/)
      end

      importer_columns + [:id] + external_reference_columns
    end

    def importer_columns
      IMPORTER_COLUMNS
    end

    # Qualifies each column with the given table alias, e.g. :stage__name.
    def prefixed_columns(columns, prefix)
      columns.map { |column| "#{prefix}__#{column}".to_sym }
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module BeetleETL
  # Step that runs a prepared query against the database for one table.
  class Transform < Step
    # The query handed to the database when the step runs.
    attr_reader :query

    # table_name   - table this step belongs to
    # dependencies - table names whose Transform steps must run first
    # query        - query passed to database.run
    def initialize(table_name, dependencies, query)
      super(table_name)
      @dependencies = dependencies
      @query = query
    end

    # Step names of the Transform steps this step depends on, as a Set.
    def dependencies
      step_names = @dependencies.map { |dependency| self.class.step_name(dependency) }

      Set.new(step_names)
    end

    # Executes the query.
    def run
      database.run(query)
    end
  end
end
|
22
|
+
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module BeetleETL

  # Raised when the items cannot be ordered so that every dependency is met
  # (for example a cycle, or a dependency on a name no item provides).
  UnsatisfiableDependenciesError = Class.new(StandardError)

  # Decides which items may run next, given the names already resolved.
  #
  # Each item must respond to #name and to #dependencies, where dependencies
  # is a Set of item names.
  class DependencyResolver

    # items - collection of items to resolve
    #
    # Raises UnsatisfiableDependenciesError if the items can never all be
    # resolved.
    def initialize(items)
      @items = items
      check
    end

    # Returns the items that are not resolved yet and whose dependencies are
    # all contained in +resolved+ (a collection of item names).
    def resolvables(resolved)
      # Built once instead of once per item.
      resolved_names = resolved.to_set

      @items.select do |item|
        # An empty dependency set is a subset of any set, so items without
        # dependencies need no special case.
        !resolved_names.include?(item.name) && item.dependencies.subset?(resolved_names)
      end
    end

    private

    # Simulates a full resolution: repeatedly moves every currently
    # resolvable item out of the pending list. If a pass resolves nothing
    # while items remain, they can never be satisfied.
    def check
      pending = @items.dup
      resolved_names = Set.new

      until pending.empty?
        resolvable, pending = pending.partition do |item|
          item.dependencies.subset?(resolved_names)
        end

        if resolvable.empty?
          stuck = pending.map(&:name).join(', ')
          raise UnsatisfiableDependenciesError, "unsatisfiable dependencies for: #{stuck}"
        end

        resolved_names.merge(resolvable.map(&:name))
      end
    end
  end
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'celluloid/autostart'
|
2
|
+
|
3
|
+
module BeetleETL
  # Celluloid actor that runs a collection of runnables. A runnable is started
  # as soon as the DependencyResolver reports it resolvable against the names
  # completed so far; the runner terminates once every runnable has completed.
  #
  # Each runnable must respond to #name, #dependencies and #run.
  class TaskRunner
    include Celluloid

    # Validates the dependencies (DependencyResolver.new may raise) and
    # immediately starts whatever is runnable.
    def initialize(runnables)
      @runnables = runnables
      @completed = Set.new
      @running = Set.new
      @dependency_resolver = DependencyResolver.new(runnables)

      run_next
    end

    # Callback used by Task: records the runnable as completed and starts the
    # runnables that became resolvable.
    def completed(runnable_name)
      @running.delete(runnable_name)
      @completed.add(runnable_name)

      run_next
    end

    # Terminates the runner when everything has completed; otherwise starts a
    # Task for each resolvable runnable that is not already running.
    def run_next
      return terminate if all_run?

      resolvables.each do |candidate|
        next if @running.include?(candidate.name)

        Task.new(Actor.current, candidate).async.run_task
        @running << candidate.name
      end
    end

    private

    # Runnables whose dependencies are all completed and which have not
    # completed themselves.
    def resolvables
      @dependency_resolver.resolvables(@completed)
    end

    # True once the completed names equal the names of all runnables.
    def all_run?
      @runnables.map(&:name).to_set == @completed
    end

    # Actor wrapping a single runnable: runs it, reports completion back to
    # the runner asynchronously, then terminates itself.
    #
    # NOTE(review): nothing here reports a failing task back to the runner; if
    # @task.run raises, `completed` is never sent -- confirm how failures are
    # expected to surface.
    class Task
      include Celluloid

      def initialize(runner, task)
        @runner = runner
        @task = task
      end

      def run_task
        @task.run
        @runner.async.completed(@task.name)
        terminate
      end
    end
  end
end
|