beetle_etl 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +23 -0
- data/.travis.yml +12 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +31 -0
- data/Rakefile +5 -0
- data/beetle_etl.gemspec +30 -0
- data/lib/beetle_etl.rb +85 -0
- data/lib/beetle_etl/dsl/dsl.rb +37 -0
- data/lib/beetle_etl/dsl/transformation.rb +26 -0
- data/lib/beetle_etl/dsl/transformation_loader.rb +22 -0
- data/lib/beetle_etl/import.rb +37 -0
- data/lib/beetle_etl/state.rb +67 -0
- data/lib/beetle_etl/steps/assign_ids.rb +54 -0
- data/lib/beetle_etl/steps/load.rb +108 -0
- data/lib/beetle_etl/steps/map_relations.rb +31 -0
- data/lib/beetle_etl/steps/step.rb +42 -0
- data/lib/beetle_etl/steps/table_diff.rb +155 -0
- data/lib/beetle_etl/steps/transform.rb +22 -0
- data/lib/beetle_etl/task_runner/dependency_resolver.rb +39 -0
- data/lib/beetle_etl/task_runner/task_runner.rb +64 -0
- data/lib/beetle_etl/version.rb +3 -0
- data/script/postgres +12 -0
- data/spec/beetle_etl_spec.rb +70 -0
- data/spec/dependency_resolver_spec.rb +57 -0
- data/spec/dsl/dsl_spec.rb +44 -0
- data/spec/dsl/transformation_loader_spec.rb +51 -0
- data/spec/dsl/transformation_spec.rb +54 -0
- data/spec/feature/example_schema.rb +192 -0
- data/spec/feature/example_transform.rb +37 -0
- data/spec/feature/feature_spec.rb +48 -0
- data/spec/import_spec.rb +7 -0
- data/spec/spec_helper.rb +25 -0
- data/spec/state_spec.rb +124 -0
- data/spec/steps/assign_ids_spec.rb +107 -0
- data/spec/steps/load_spec.rb +148 -0
- data/spec/steps/map_relations_spec.rb +92 -0
- data/spec/steps/step_spec.rb +37 -0
- data/spec/steps/table_diff_spec.rb +183 -0
- data/spec/steps/transform_spec.rb +34 -0
- data/spec/support/database.yml.example +9 -0
- data/spec/support/database.yml.travis +4 -0
- data/spec/support/database_helpers.rb +58 -0
- metadata +220 -0
@@ -0,0 +1,108 @@
|
|
1
|
+
module BeetleETL
|
2
|
+
class Load < Step

  # Bookkeeping columns of the stage table; they are never copied as record data.
  IMPORTER_COLUMNS = %i[
    import_run_id
    external_source
    transition
  ]

  # Runs load_create, load_update, load_delete and load_undelete, in that order.
  def run
    %w(create update delete undelete).each { |kind| public_send(:"load_#{kind}") }
  end

  # Inserts every stage row of the current run marked 'CREATE' into the target
  # table, adding the configured external source and created_at / updated_at
  # timestamps taken from a single clock reading.
  def load_create
    timestamp = now
    target_columns = data_columns + [:external_source, :created_at, :updated_at]

    created_rows = database[:"#{stage_schema}__#{table_name}"]
      .select(*data_columns)
      .where(import_run_id: run_id, transition: 'CREATE')
      .select_more(external_source, timestamp, timestamp)

    database[table_name].import(target_columns, created_rows)
  end

  # Copies all updatable columns from stage rows marked 'UPDATE' onto the
  # target rows with the same id and sets updated_at.
  def load_update
    assignments = updatable_columns.each_with_object({ updated_at: now }) do |column, acc|
      acc[column] = :"stage__#{column}"
    end

    staged_rows('UPDATE').update(assignments)
  end

  # Soft-deletes target rows whose stage counterpart is marked 'DELETE' by
  # setting deleted_at (and updated_at) to the same timestamp.
  def load_delete
    timestamp = now
    staged_rows('DELETE').update(updated_at: timestamp, deleted_at: timestamp)
  end

  # Clears deleted_at on target rows whose stage counterpart is marked
  # 'UNDELETE' and refreshes all updatable columns from the stage row.
  def load_undelete
    assignments = updatable_columns.each_with_object({ updated_at: now, deleted_at: nil }) do |column, acc|
      acc[column] = :"stage__#{column}"
    end

    staged_rows('UNDELETE').update(assignments)
  end

  private

  # Target table (aliased "public") joined to the stage table (aliased
  # "stage") on id, restricted to the current run and the given transition.
  def staged_rows(transition)
    database
      .from(:"#{table_name}___public", :"#{stage_schema}__#{table_name}___stage")
      .where(
        stage__id: :public__id,
        stage__transition: transition,
        stage__import_run_id: run_id
      )
  end

  # Stage table columns that carry record data.
  def data_columns
    table_columns - ignored_columns
  end

  # Column names of the stage table, looked up once per step instance.
  def table_columns
    @table_columns ||= database[:"#{stage_schema}__#{table_name}"].columns
  end

  # Importer bookkeeping columns plus every external_<something>_id column.
  def ignored_columns
    external_reference_columns = table_columns.select do |name|
      name.to_s =~ /^external_.+_id$/
    end

    IMPORTER_COLUMNS + external_reference_columns
  end

  # Data columns that may be overwritten on an existing target row.
  def updatable_columns
    data_columns - [:id, :external_source, :external_id]
  end

  def now
    Time.now
  end

end
|
108
|
+
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module BeetleETL
|
2
|
+
class MapRelations < Step

  # Hash of foreign key column => foreign table name.
  attr_reader :relations

  def initialize(table_name, relations)
    super(table_name)
    @relations = relations
  end

  # The AssignIds step of every related table plus this table's own
  # Transform step, as a Set of step names.
  def dependencies
    step_names = relations.values.map { |foreign_table| AssignIds.step_name(foreign_table) }.to_set
    step_names << Transform.step_name(table_name)
  end

  # For each relation, sets the foreign key column of this table's stage rows
  # to the id of the foreign stage row whose external_id equals the row's
  # external_<foreign key column> value. Both sides are restricted to the
  # current import run.
  def run
    relations.each do |foreign_key_column, foreign_table_name|
      own_stage = :"#{stage_schema}__#{table_name}___ST"
      foreign_stage = :"#{stage_schema}__#{foreign_table_name}___FT"

      database
        .from(own_stage, foreign_stage)
        .where(
          ST__import_run_id: run_id,
          FT__import_run_id: run_id,
          FT__external_id: :"ST__external_#{foreign_key_column}"
        )
        .update(:"#{foreign_key_column}" => :"FT__id")
    end
  end

end
|
31
|
+
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
module BeetleETL
|
2
|
+
|
3
|
+
# Raised by Step#dependencies when a subclass has not overridden it.
DependenciesNotDefinedError = Class.new(StandardError)

# Base class for all import steps. A step is bound to one table and exposes
# the shared run id, configuration values and database handle to subclasses.
class Step

  attr_reader :table_name

  def initialize(table_name)
    @table_name = table_name
  end

  # Builds the step identifier "<table_name>: <ClassName>", using the class
  # name without its enclosing namespaces.
  def self.step_name(table_name)
    "#{table_name}: #{name.split('::').last}"
  end

  # Identifier of this step instance (see .step_name).
  def name
    self.class.step_name(table_name)
  end

  # Subclasses must override this method.
  #
  # @raise [DependenciesNotDefinedError] always; the message names the class
  #   that is missing the override so the failure can be traced.
  def dependencies
    raise DependenciesNotDefinedError, "#{self.class} does not define #dependencies"
  end

  def run_id
    BeetleETL.state.run_id
  end

  def stage_schema
    BeetleETL.config.stage_schema
  end

  def external_source
    BeetleETL.config.external_source
  end

  def database
    BeetleETL.database
  end

end
|
42
|
+
end
|
@@ -0,0 +1,155 @@
|
|
1
|
+
module BeetleETL
|
2
|
+
# Compares the stage rows of the current import run with the target table
# (aliased "public" in all queries) and records the outcome in the stage
# table's transition column.
class TableDiff < Step

  # Bookkeeping columns of the stage table; excluded from record comparison.
  IMPORTER_COLUMNS = %i[
    import_run_id
    external_id
    transition
  ]

  # @return [Set<String>] the name of this table's MapRelations step
  def dependencies
    [MapRelations.step_name(table_name)].to_set
  end

  # Runs all transition_* methods in the order create, keep, update, delete,
  # undelete.
  def run
    %w(create keep update delete undelete).each do |transition|
      public_send(:"transition_#{transition}")
    end
  end

  # Marks stage rows as 'CREATE' when no target row with the same external_id
  # and the configured external source exists.
  def transition_create
    existing = public_table.where(
      public__external_id: :stage__external_id,
      public__external_source: external_source
    )

    stage_table
      .where(stage__import_run_id: run_id)
      .where(Sequel.~(existing.exists))
      .update(transition: 'CREATE')
  end

  # Marks stage rows as 'KEEP' when a non-deleted target row with the same
  # external_id and source exists and all data columns are not distinct.
  def transition_keep
    unchanged = public_table
      .where(
        public__external_id: :stage__external_id,
        public__external_source: external_source,
        public__deleted_at: nil
      )
      .where(
        ':public_columns IS NOT DISTINCT FROM :stage_columns',
        public_columns: public_record_columns,
        stage_columns: stage_record_columns
      )

    stage_table
      .where(stage__import_run_id: run_id)
      .where(unchanged.exists)
      .update(transition: 'KEEP')
  end

  # Marks stage rows as 'UPDATE' when a non-deleted target row with the same
  # external_id and source exists and the data columns are distinct.
  def transition_update
    changed = public_table
      .where(
        public__external_id: :stage__external_id,
        public__external_source: external_source,
        public__deleted_at: nil
      )
      .where(
        ':public_columns IS DISTINCT FROM :stage_columns',
        public_columns: public_record_columns,
        stage_columns: stage_record_columns
      )

    stage_table
      .where(stage__import_run_id: run_id)
      .where(changed.exists)
      .update(transition: 'UPDATE')
  end

  # Inserts a 'DELETE' stage row for every non-deleted target row of the
  # configured external source that has no stage row with the same
  # external_id.
  #
  # The external_source filter lives in the WHERE clause on purpose: inside
  # the ON clause of the RIGHT JOIN it would not filter anything, because
  # target rows of other sources would fail the join condition, survive with
  # NULL stage columns, and be staged for deletion.
  #
  # NOTE(review): the join does not restrict stage rows to the current
  # import_run_id, so rows left over from earlier runs count as "present" --
  # confirm the stage table is emptied between runs.
  def transition_delete
    deleted_dataset = database.from(
      :"#{stage_schema}__#{table_name}___stage"
    ).right_join(
      :"#{table_name}___public",
      public__external_id: :stage__external_id
    ).where(
      stage__external_id: nil,
      public__external_source: external_source,
      public__deleted_at: nil
    )

    database[:"#{stage_schema}__#{table_name}"]
      .import(
        [
          :import_run_id,
          :external_id,
          :transition
        ],
        deleted_dataset
          .select(
            run_id,
            :public__external_id,
            'DELETE'
          )
      )
  end

  # Marks stage rows as 'UNDELETE' when a target row with the same
  # external_id and source exists and its deleted_at is set.
  def transition_undelete
    soft_deleted = public_table
      .where(
        public__external_id: :stage__external_id,
        public__external_source: external_source
      )
      .exclude(public__deleted_at: nil)

    stage_table
      .where(stage__import_run_id: run_id)
      .where(soft_deleted.exists)
      .update(transition: 'UNDELETE')
  end

  private

  # Stage table dataset, aliased "stage".
  def stage_table
    @stage_table ||= database[:"#{stage_schema}__#{table_name}___stage"]
  end

  # Target table dataset, aliased "public".
  def public_table
    @public_table ||= database[:"#{table_name}___public"]
  end

  def public_record_columns
    prefixed_columns(data_columns, 'public')
  end

  def stage_record_columns
    prefixed_columns(data_columns, 'stage')
  end

  # Stage table columns that take part in the record comparison.
  def data_columns
    table_columns - ignored_columns
  end

  # Column names of the stage table, looked up once per step instance.
  def table_columns
    @table_columns ||= database[:"#{stage_schema}__#{table_name}"].columns
  end

  # Importer columns, :id, and every external_<something>_id column.
  def ignored_columns
    importer_columns + [:id] + table_columns.select do |column_name|
      column_name.to_s.index(/^external_.+_id$/)
    end
  end

  def importer_columns
    IMPORTER_COLUMNS
  end

  # Qualifies each column with the given table alias (Sequel's
  # double-underscore symbol form).
  def prefixed_columns(columns, prefix)
    columns.map { |column| "#{prefix}__#{column}".to_sym }
  end

end
|
155
|
+
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module BeetleETL
|
2
|
+
class Transform < Step

  attr_reader :query

  # table_name   - table this transformation fills
  # dependencies - names of tables whose Transform steps must run first
  # query        - statement handed to database.run
  def initialize(table_name, dependencies, query)
    super(table_name)
    @query = query
    @dependencies = dependencies
  end

  # Transform step names of all tables this step depends on, as a Set.
  def dependencies
    @dependencies.map { |table| self.class.step_name(table) }.to_set
  end

  # Executes the stored query against the database.
  def run
    database.run(query)
  end

end
|
21
|
+
end
|
22
|
+
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module BeetleETL
|
2
|
+
|
3
|
+
# Raised when the given items can never all be resolved (missing or cyclic
# dependencies).
UnsatisfiableDependenciesError = Class.new(StandardError)

# Decides which items may run next, given the names of the items that are
# already done. Every item must respond to #name and to #dependencies, the
# latter returning a Set of item names.
class DependencyResolver

  # @param items [Array] items responding to #name and #dependencies
  # @raise [UnsatisfiableDependenciesError] if the items cannot all be
  #   resolved in any order
  def initialize(items)
    @items = items
    check
  end

  # @param resolved [Enumerable] names of the items that are already done
  # @return [Array] items that are not in +resolved+ and whose dependencies
  #   are all contained in +resolved+
  def resolvables(resolved)
    # Built once instead of once per item; an empty dependency set is a
    # subset of anything, so no separate empty? check is needed.
    resolved_names = resolved.to_set

    @items.select do |item|
      item.dependencies.subset?(resolved_names) && !resolved.include?(item.name)
    end
  end

  private

  # Resolves the items round by round and fails as soon as a round makes no
  # progress.
  def check
    pending = @items.dup
    resolved_names = Set.new

    until pending.empty?
      resolvable, pending = pending.partition do |item|
        item.dependencies.subset?(resolved_names)
      end

      if resolvable.empty?
        raise UnsatisfiableDependenciesError,
          "unsatisfiable dependencies for: #{pending.map(&:name).join(', ')}"
      end

      resolved_names.merge(resolvable.map(&:name))
    end
  end

end
|
39
|
+
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'celluloid/autostart'
|
2
|
+
|
3
|
+
module BeetleETL
|
4
|
+
class TaskRunner

  include Celluloid

  # Sets up the bookkeeping sets and the dependency resolver, then starts
  # the first round of runnables.
  def initialize(runnables)
    @runnables = runnables
    @completed = Set.new
    @running = Set.new
    @dependency_resolver = DependencyResolver.new(runnables)

    run_next
  end

  # Called by a Task once its runnable has finished: moves the name from the
  # running set to the completed set and schedules whatever is unblocked.
  def completed(runnable_name)
    @running.delete(runnable_name)
    @completed << runnable_name

    run_next
  end

  # Terminates this actor when every runnable has completed; otherwise starts
  # a Task for each resolvable runnable that is not already running.
  def run_next
    return terminate if all_run?

    resolvables.each do |runnable|
      next if @running.include?(runnable.name)

      Task.new(Actor.current, runnable).async.run_task
      @running << runnable.name
    end
  end

  private

  # Runnables whose dependencies are covered by the completed set.
  def resolvables
    @dependency_resolver.resolvables(@completed)
  end

  def all_run?
    @runnables.map(&:name).to_set == @completed
  end

  # Actor that runs a single runnable and reports back to the runner.
  class Task

    include Celluloid

    def initialize(runner, task)
      @runner = runner
      @task = task
    end

    # Runs the wrapped task, notifies the runner asynchronously and then
    # terminates this actor. The notification is only sent after @task.run
    # has returned.
    def run_task
      @task.run
      @runner.async.completed(@task.name)
      terminate
    end
  end

end
|
64
|
+
end
|