beetle_etl 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +23 -0
  3. data/.travis.yml +12 -0
  4. data/Gemfile +4 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +31 -0
  7. data/Rakefile +5 -0
  8. data/beetle_etl.gemspec +30 -0
  9. data/lib/beetle_etl.rb +85 -0
  10. data/lib/beetle_etl/dsl/dsl.rb +37 -0
  11. data/lib/beetle_etl/dsl/transformation.rb +26 -0
  12. data/lib/beetle_etl/dsl/transformation_loader.rb +22 -0
  13. data/lib/beetle_etl/import.rb +37 -0
  14. data/lib/beetle_etl/state.rb +67 -0
  15. data/lib/beetle_etl/steps/assign_ids.rb +54 -0
  16. data/lib/beetle_etl/steps/load.rb +108 -0
  17. data/lib/beetle_etl/steps/map_relations.rb +31 -0
  18. data/lib/beetle_etl/steps/step.rb +42 -0
  19. data/lib/beetle_etl/steps/table_diff.rb +155 -0
  20. data/lib/beetle_etl/steps/transform.rb +22 -0
  21. data/lib/beetle_etl/task_runner/dependency_resolver.rb +39 -0
  22. data/lib/beetle_etl/task_runner/task_runner.rb +64 -0
  23. data/lib/beetle_etl/version.rb +3 -0
  24. data/script/postgres +12 -0
  25. data/spec/beetle_etl_spec.rb +70 -0
  26. data/spec/dependency_resolver_spec.rb +57 -0
  27. data/spec/dsl/dsl_spec.rb +44 -0
  28. data/spec/dsl/transformation_loader_spec.rb +51 -0
  29. data/spec/dsl/transformation_spec.rb +54 -0
  30. data/spec/feature/example_schema.rb +192 -0
  31. data/spec/feature/example_transform.rb +37 -0
  32. data/spec/feature/feature_spec.rb +48 -0
  33. data/spec/import_spec.rb +7 -0
  34. data/spec/spec_helper.rb +25 -0
  35. data/spec/state_spec.rb +124 -0
  36. data/spec/steps/assign_ids_spec.rb +107 -0
  37. data/spec/steps/load_spec.rb +148 -0
  38. data/spec/steps/map_relations_spec.rb +92 -0
  39. data/spec/steps/step_spec.rb +37 -0
  40. data/spec/steps/table_diff_spec.rb +183 -0
  41. data/spec/steps/transform_spec.rb +34 -0
  42. data/spec/support/database.yml.example +9 -0
  43. data/spec/support/database.yml.travis +4 -0
  44. data/spec/support/database_helpers.rb +58 -0
  45. metadata +220 -0
@@ -0,0 +1,108 @@
1
module BeetleETL
  # Applies the rows staged for the current import run to the target table,
  # one transition kind at a time (CREATE, UPDATE, DELETE, UNDELETE).
  #
  # The stage table is addressed as "<stage_schema>__<table_name>" and the
  # target table as "<table_name>"; the "___public" / "___stage" suffixes are
  # the aliases the join conditions below refer to.
  class Load < Step

    # Stage-table bookkeeping columns that are never copied as plain data.
    IMPORTER_COLUMNS = %i[
      import_run_id
      external_source
      transition
    ].freeze

    # Runs every load phase in a fixed order: create, update, delete, undelete.
    def run
      %w(create update delete undelete).each do |transition|
        public_send(:"load_#{transition}")
      end
    end

    # Inserts all stage rows of this run marked 'CREATE' into the target table,
    # adding the configured external source and identical created_at/updated_at
    # timestamps to every inserted row.
    def load_create
      just_now = now
      staged_creates = database[:"#{stage_schema}__#{table_name}"]
        .select(*data_columns)
        .where(
          import_run_id: run_id,
          transition: 'CREATE'
        )
        .select_more(external_source, just_now, just_now)

      database[table_name].import(
        data_columns + [:external_source, :created_at, :updated_at],
        staged_creates
      )
    end

    # Overwrites the updatable columns of target rows whose stage counterpart
    # is marked 'UPDATE' and stamps updated_at.
    def load_update
      updates = stage_assignments({ updated_at: now })
      staged_rows('UPDATE').update(updates)
    end

    # Soft-deletes target rows whose stage counterpart is marked 'DELETE' by
    # setting deleted_at (and updated_at) to the same timestamp.
    def load_delete
      just_now = now
      staged_rows('DELETE').update(
        updated_at: just_now,
        deleted_at: just_now
      )
    end

    # Clears deleted_at and overwrites the updatable columns of target rows
    # whose stage counterpart is marked 'UNDELETE'.
    def load_undelete
      updates = stage_assignments({ updated_at: now, deleted_at: nil })
      staged_rows('UNDELETE').update(updates)
    end

    private

    # Target table joined with the stage table on id, restricted to stage rows
    # of the current run that carry the given transition.
    def staged_rows(transition)
      database.from(
        :"#{table_name}___public",
        :"#{stage_schema}__#{table_name}___stage"
      )
        .where(
          stage__id: :public__id,
          stage__transition: transition,
          stage__import_run_id: run_id
        )
    end

    # Extends +base+ with one "column => stage.column" assignment per
    # updatable column and returns it. A column named like a key already in
    # +base+ replaces that entry.
    def stage_assignments(base)
      updatable_columns.each_with_object(base) do |column, assignments|
        assignments[column] = :"stage__#{column}"
      end
    end

    # Stage-table columns that hold actual record data.
    def data_columns
      table_columns - ignored_columns
    end

    # Column names of the stage table, fetched once per step instance.
    def table_columns
      @table_columns ||= database[:"#{stage_schema}__#{table_name}"].columns
    end

    # Bookkeeping columns plus every column named external_<something>_id.
    def ignored_columns
      external_reference_columns = table_columns.select do |column_name|
        column_name.to_s =~ /^external_.+_id$/
      end

      IMPORTER_COLUMNS + external_reference_columns
    end

    # Data columns that an update is allowed to overwrite.
    def updatable_columns
      data_columns - [:id, :external_source, :external_id]
    end

    # Single place to obtain the current time.
    def now
      Time.now
    end

  end
end
@@ -0,0 +1,31 @@
1
require 'set'

module BeetleETL
  # Fills the foreign key columns of a stage table by looking up the ids of
  # the referenced rows in the stage tables of the related tables.
  class MapRelations < Step

    # Hash of foreign key column name => name of the table it refers to.
    attr_reader :relations

    # @param table_name name of the table whose stage rows are mapped
    # @param relations  Hash of foreign key column => foreign table name
    def initialize(table_name, relations)
      super(table_name)
      @relations = relations
    end

    # Names of the steps that have to finish first: the AssignIds step of
    # every referenced table plus the Transform step of this table.
    #
    # @return [Set<String>]
    def dependencies
      required = relations.values.map { |d| AssignIds.step_name(d) }.to_set
      required << Transform.step_name(table_name)
    end

    # For each relation, sets <foreign_key_column> on this table's stage rows
    # (alias ST) to the id of the foreign stage row (alias FT) whose
    # external_id equals ST.external_<foreign_key_column>. Only rows of the
    # current import run are considered on both sides.
    def run
      relations.each do |foreign_key_column, foreign_table_name|
        joined_stage_tables = database.from(
          :"#{stage_schema}__#{table_name}___ST",
          :"#{stage_schema}__#{foreign_table_name}___FT"
        )

        joined_stage_tables.where(
          ST__import_run_id: run_id,
          FT__import_run_id: run_id,
          FT__external_id: :"ST__external_#{foreign_key_column}"
        ).update(
          :"#{foreign_key_column}" => :FT__id
        )
      end
    end

  end
end
@@ -0,0 +1,42 @@
1
module BeetleETL

  # Raised by Step#dependencies when a step class does not override it.
  DependenciesNotDefinedError = Class.new(StandardError)

  # Base class of all import steps. A step works on one table and is
  # identified by a name built from that table and the step's class.
  class Step

    attr_reader :table_name

    # @param table_name name of the table this step operates on
    def initialize(table_name)
      @table_name = table_name
    end

    # Builds the step name "<table_name>: <ClassName>", where ClassName is the
    # last segment of the (possibly namespaced) class name.
    #
    # Anonymous classes have no name; they yield "<table_name>: " instead of
    # raising NoMethodError on nil.
    #
    # @return [String]
    def self.step_name(table_name)
      "#{table_name}: #{name.to_s.split('::').last}"
    end

    # Name of this step instance, see Step.step_name.
    def name
      self.class.step_name(table_name)
    end

    # Names of the steps that must have completed before this one may run.
    # Subclasses are expected to override this.
    #
    # @raise [DependenciesNotDefinedError] always, in this base implementation
    def dependencies
      raise DependenciesNotDefinedError, "#{self.class} does not define #dependencies"
    end

    # Identifier of the current import run, read from the global state.
    def run_id
      BeetleETL.state.run_id
    end

    # Configured name of the schema holding the stage tables.
    def stage_schema
      BeetleETL.config.stage_schema
    end

    # Configured identifier of the external data source.
    def external_source
      BeetleETL.config.external_source
    end

    # Database handle shared by all steps.
    def database
      BeetleETL.database
    end

  end
end
@@ -0,0 +1,155 @@
1
require 'set'

module BeetleETL
  # Compares the stage rows of the current import run with the target table
  # and records, in the stage table's transition column, what has to happen
  # to each record: CREATE, KEEP, UPDATE, DELETE or UNDELETE.
  #
  # Stage and target rows are matched on external_id, with the target side
  # limited to the configured external source.
  class TableDiff < Step

    # Stage-table bookkeeping columns that are excluded from the comparison.
    IMPORTER_COLUMNS = %i[
      import_run_id
      external_id
      transition
    ].freeze

    # The relations of this table must be mapped before it can be diffed.
    #
    # @return [Set<String>]
    def dependencies
      [MapRelations.step_name(table_name)].to_set
    end

    # Determines all transitions in a fixed order.
    def run
      %w(create keep update delete undelete).each do |transition|
        public_send(:"transition_#{transition}")
      end
    end

    # Stage rows without any matching target row are marked 'CREATE'.
    def transition_create
      staged_rows
        .where(Sequel.~(public_counterparts.exists))
        .update(transition: 'CREATE')
    end

    # Stage rows whose non-deleted target row has identical data columns are
    # marked 'KEEP'.
    def transition_keep
      mark_live_rows('KEEP', ':public_columns IS NOT DISTINCT FROM :stage_columns')
    end

    # Stage rows whose non-deleted target row differs in at least one data
    # column are marked 'UPDATE'.
    def transition_update
      mark_live_rows('UPDATE', ':public_columns IS DISTINCT FROM :stage_columns')
    end

    # Inserts a 'DELETE' stage row (run id, external id, transition) for every
    # non-deleted target row that has no stage row joined to it.
    #
    # NOTE(review): both join conditions, including the external_source
    # filter, are part of the right join's ON clause, and the stage side is
    # not restricted to the current run. It looks like target rows of other
    # external sources would also end up with no stage match and be selected
    # here -- confirm whether that is intended.
    def transition_delete
      deleted_dataset = database.from(
        :"#{stage_schema}__#{table_name}___stage"
      ).right_join(
        :"#{table_name}___public",
        public__external_id: :stage__external_id,
        public__external_source: external_source
      ).where(
        stage__external_id: nil,
        public__deleted_at: nil
      )

      database[:"#{stage_schema}__#{table_name}"]
        .import(
          [
            :import_run_id,
            :external_id,
            :transition
          ],
          deleted_dataset
            .select(
              run_id,
              :public__external_id,
              'DELETE'
            )
        )
    end

    # Stage rows whose matching target row has deleted_at set are marked
    # 'UNDELETE'.
    def transition_undelete
      deleted_counterparts = public_counterparts.exclude(public__deleted_at: nil)

      staged_rows
        .where(deleted_counterparts.exists)
        .update(transition: 'UNDELETE')
    end

    private

    # Stage rows belonging to the current import run.
    def staged_rows
      stage_table.where(stage__import_run_id: run_id)
    end

    # Conditions tying a target row to the stage row under consideration.
    def counterpart_conditions
      {
        public__external_id: :stage__external_id,
        public__external_source: external_source
      }
    end

    # Target rows matching the stage row, deleted or not.
    def public_counterparts
      public_table.where(counterpart_conditions)
    end

    # Marks stage rows with +transition+ when a non-deleted matching target
    # row exists whose data columns satisfy +comparison+, an SQL fragment with
    # the :public_columns and :stage_columns placeholders.
    def mark_live_rows(transition, comparison)
      live_counterparts = public_table
        .where(counterpart_conditions.merge(public__deleted_at: nil))
        .where(
          comparison,
          public_columns: public_record_columns,
          stage_columns: stage_record_columns
        )

      staged_rows
        .where(live_counterparts.exists)
        .update(transition: transition)
    end

    # Stage table dataset, aliased as "stage".
    def stage_table
      @stage_table ||= database[:"#{stage_schema}__#{table_name}___stage"]
    end

    # Target table dataset, aliased as "public".
    def public_table
      @public_table ||= database[:"#{table_name}___public"]
    end

    # Data columns qualified with the "public" alias.
    def public_record_columns
      prefixed_columns(data_columns, 'public')
    end

    # Data columns qualified with the "stage" alias.
    def stage_record_columns
      prefixed_columns(data_columns, 'stage')
    end

    # Stage-table columns that take part in the comparison.
    def data_columns
      table_columns - ignored_columns
    end

    # Column names of the stage table, fetched once per step instance.
    def table_columns
      @table_columns ||= database[:"#{stage_schema}__#{table_name}"].columns
    end

    # Bookkeeping columns, the id column and every column named
    # external_<something>_id.
    def ignored_columns
      external_reference_columns = table_columns.select do |column_name|
        column_name.to_s =~ /^external_.+_id$/
      end

      importer_columns + [:id] + external_reference_columns
    end

    def importer_columns
      IMPORTER_COLUMNS
    end

    # Turns each column into the symbol "<prefix>__<column>".
    def prefixed_columns(columns, prefix)
      columns.map { |column| "#{prefix}__#{column}".to_sym }
    end

  end
end
@@ -0,0 +1,22 @@
1
require 'set'

module BeetleETL
  # Step that executes the transformation query of one table.
  class Transform < Step

    # The query handed to the database when the step runs.
    attr_reader :query

    # @param table_name   name of the table this transformation belongs to
    # @param dependencies names of the tables whose Transform steps must run
    #                     first; nil or a single name are accepted as well
    # @param query        query passed unchanged to the database
    def initialize(table_name, dependencies, query)
      super(table_name)
      @dependencies = dependencies
      @query = query
    end

    # Step names of the Transform steps this step depends on.
    #
    # @return [Set<String>]
    def dependencies
      Array(@dependencies).map { |d| self.class.step_name(d) }.to_set
    end

    # Executes the query on the shared database handle.
    def run
      database.run(query)
    end

  end
end
22
+
@@ -0,0 +1,39 @@
1
require 'set'

module BeetleETL

  # Raised when the given items can never all be resolved, e.g. because of a
  # cyclic or missing dependency.
  UnsatisfiableDependenciesError = Class.new(StandardError)

  # Decides which items may run next, given the names of the items that are
  # already done. Every item must respond to #name and to #dependencies (a
  # collection of names of other items).
  class DependencyResolver

    # @param items [Array] items responding to #name and #dependencies
    # @raise [UnsatisfiableDependenciesError] if the items cannot be ordered
    def initialize(items)
      @items = items
      check
    end

    # Items that are not resolved yet and whose dependencies are all
    # contained in +resolved+.
    #
    # @param resolved [Enumerable] names of the items already resolved
    # @return [Array] resolvable items, in their original order
    def resolvables(resolved)
      done = resolved.to_set

      @items.select do |item|
        !done.include?(item.name) && item.dependencies.to_set.subset?(done)
      end
    end

    private

    # Resolves the items round by round, each round taking every item whose
    # dependencies were satisfied by earlier rounds. A round that resolves
    # nothing while items remain means the rest can never be satisfied.
    def check
      pending = @items.dup
      resolved_names = Set.new

      until pending.empty?
        resolvable, pending = pending.partition do |item|
          item.dependencies.to_set.subset?(resolved_names)
        end

        if resolvable.empty?
          raise UnsatisfiableDependenciesError,
                "cannot satisfy the dependencies of: #{pending.map(&:name).join(', ')}"
        end

        resolved_names.merge(resolvable.map(&:name))
      end
    end

  end
end
@@ -0,0 +1,64 @@
1
require 'set'

require 'celluloid/autostart'
2
+
3
module BeetleETL
  # Celluloid actor that runs a collection of runnables, starting each one as
  # soon as the DependencyResolver reports its dependencies as completed.
  # Every runnable must respond to #name, #dependencies and #run.
  class TaskRunner

    include Celluloid

    # Validates the dependencies (DependencyResolver.new may raise) and
    # immediately starts everything that is runnable.
    #
    # @param runnables [Array] objects responding to #name, #dependencies, #run
    def initialize(runnables)
      @runnables = runnables
      @completed = Set.new
      @running = Set.new
      @dependency_resolver = DependencyResolver.new(runnables)

      run_next
    end

    # Callback invoked by a Task once its runnable has finished: records the
    # completion and starts whatever became runnable.
    def completed(runnable_name)
      @running.delete(runnable_name)
      @completed << runnable_name

      run_next
    end

    # Terminates this actor when every runnable has completed; otherwise
    # starts a Task for each resolvable runnable that is not running yet.
    def run_next
      return terminate if all_run?

      resolvables.each do |runnable|
        next if @running.include?(runnable.name)

        Task.new(Actor.current, runnable).async.run_task
        @running << runnable.name
      end
    end

    private

    # Runnables whose dependencies are all among the completed ones.
    def resolvables
      @dependency_resolver.resolvables(@completed)
    end

    # True once the completed names equal the names of all runnables.
    def all_run?
      @completed == @runnables.map(&:name).to_set
    end

    # Actor wrapping a single runnable; reports back to the runner when done.
    class Task

      include Celluloid

      # @param runner the TaskRunner actor to notify
      # @param task   the runnable to execute
      def initialize(runner, task)
        @runner = runner
        @task = task
      end

      # Runs the task, notifies the runner asynchronously and terminates
      # this actor.
      #
      # NOTE(review): if @task.run raises, the runner is never notified from
      # here -- confirm how task failures are meant to surface.
      def run_task
        @task.run
        @runner.async.completed(@task.name)
        terminate
      end
    end

  end
end