beetle_etl 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +23 -0
  3. data/.travis.yml +12 -0
  4. data/Gemfile +4 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +31 -0
  7. data/Rakefile +5 -0
  8. data/beetle_etl.gemspec +30 -0
  9. data/lib/beetle_etl.rb +85 -0
  10. data/lib/beetle_etl/dsl/dsl.rb +37 -0
  11. data/lib/beetle_etl/dsl/transformation.rb +26 -0
  12. data/lib/beetle_etl/dsl/transformation_loader.rb +22 -0
  13. data/lib/beetle_etl/import.rb +37 -0
  14. data/lib/beetle_etl/state.rb +67 -0
  15. data/lib/beetle_etl/steps/assign_ids.rb +54 -0
  16. data/lib/beetle_etl/steps/load.rb +108 -0
  17. data/lib/beetle_etl/steps/map_relations.rb +31 -0
  18. data/lib/beetle_etl/steps/step.rb +42 -0
  19. data/lib/beetle_etl/steps/table_diff.rb +155 -0
  20. data/lib/beetle_etl/steps/transform.rb +22 -0
  21. data/lib/beetle_etl/task_runner/dependency_resolver.rb +39 -0
  22. data/lib/beetle_etl/task_runner/task_runner.rb +64 -0
  23. data/lib/beetle_etl/version.rb +3 -0
  24. data/script/postgres +12 -0
  25. data/spec/beetle_etl_spec.rb +70 -0
  26. data/spec/dependency_resolver_spec.rb +57 -0
  27. data/spec/dsl/dsl_spec.rb +44 -0
  28. data/spec/dsl/transformation_loader_spec.rb +51 -0
  29. data/spec/dsl/transformation_spec.rb +54 -0
  30. data/spec/feature/example_schema.rb +192 -0
  31. data/spec/feature/example_transform.rb +37 -0
  32. data/spec/feature/feature_spec.rb +48 -0
  33. data/spec/import_spec.rb +7 -0
  34. data/spec/spec_helper.rb +25 -0
  35. data/spec/state_spec.rb +124 -0
  36. data/spec/steps/assign_ids_spec.rb +107 -0
  37. data/spec/steps/load_spec.rb +148 -0
  38. data/spec/steps/map_relations_spec.rb +92 -0
  39. data/spec/steps/step_spec.rb +37 -0
  40. data/spec/steps/table_diff_spec.rb +183 -0
  41. data/spec/steps/transform_spec.rb +34 -0
  42. data/spec/support/database.yml.example +9 -0
  43. data/spec/support/database.yml.travis +4 -0
  44. data/spec/support/database_helpers.rb +58 -0
  45. metadata +220 -0
@@ -0,0 +1,108 @@
module BeetleETL
  # Applies the transitions recorded in the stage table for the current import
  # run to the public table of the same name: CREATE rows are inserted, UPDATE
  # and UNDELETE rows overwrite the matching public row, and DELETE rows are
  # soft-deleted by setting +deleted_at+.
  #
  # NOTE(review): this class does not override Step#dependencies, so calling
  # #dependencies on a Load step raises DependenciesNotDefinedError -- confirm
  # whether that is intended.
  class Load < Step

    # Stage-table bookkeeping columns that are never copied to the public table.
    IMPORTER_COLUMNS = %i[
      import_run_id
      external_source
      transition
    ].freeze

    # Runs every load phase, in the order create, update, delete, undelete.
    def run
      %w(create update delete undelete).each do |transition|
        public_send(:"load_#{transition}")
      end
    end

    # Inserts the stage rows marked 'CREATE' for this run into the public
    # table, adding the configured external source and identical
    # created_at/updated_at timestamps.
    def load_create
      just_now = now
      new_rows = database[:"#{stage_schema}__#{table_name}"]
        .select(*data_columns)
        .where(import_run_id: run_id, transition: 'CREATE')
        .select_more(external_source, just_now, just_now)

      database[table_name].import(
        data_columns + [:external_source, :created_at, :updated_at],
        new_rows
      )
    end

    # Overwrites the updatable columns of public rows whose stage row is
    # marked 'UPDATE' and bumps +updated_at+.
    def load_update
      update_from_stage('UPDATE', { updated_at: now })
    end

    # Soft-deletes public rows whose stage row is marked 'DELETE'.
    def load_delete
      just_now = now
      joined_rows('DELETE').update(
        updated_at: just_now,
        deleted_at: just_now
      )
    end

    # Like #load_update for rows marked 'UNDELETE', additionally clearing
    # +deleted_at+.
    def load_undelete
      update_from_stage('UNDELETE', { updated_at: now, deleted_at: nil })
    end

    private

    # Dataset over the public table (aliased +public+) and the stage table
    # (aliased +stage+), restricted to rows that share an id, belong to the
    # current run and carry the given transition.
    def joined_rows(transition)
      database.from(
        :"#{table_name}___public",
        :"#{stage_schema}__#{table_name}___stage"
      ).where(
        stage__id: :public__id,
        stage__transition: transition,
        stage__import_run_id: run_id
      )
    end

    # Updates the joined public rows for +transition+ with +fixed_values+
    # followed by one "column = stage.column" assignment per updatable column.
    def update_from_stage(transition, fixed_values)
      stage_values = updatable_columns.each_with_object({}) do |column, acc|
        acc[column] = :"stage__#{column}"
      end

      joined_rows(transition).update(fixed_values.merge(stage_values))
    end

    # Stage columns that carry record data (everything except ignored columns).
    def data_columns
      table_columns - ignored_columns
    end

    # Column names of the stage table, looked up once per step instance.
    def table_columns
      @table_columns ||= database[:"#{stage_schema}__#{table_name}"].columns
    end

    # Importer bookkeeping columns plus every external_<name>_id column.
    # The pattern is anchored with \A/\z so it must match the entire name.
    def ignored_columns
      external_id_columns = table_columns.select do |column_name|
        column_name.to_s =~ /\Aexternal_.+_id\z/
      end

      IMPORTER_COLUMNS + external_id_columns
    end

    # Data columns that may be overwritten on an existing public row.
    def updatable_columns
      data_columns - [:id, :external_source, :external_id]
    end

    def now
      Time.now
    end

  end
end
@@ -0,0 +1,31 @@
module BeetleETL
  # Fills the foreign key columns of a stage table by looking up, for every
  # configured relation, the id of the referenced row in the foreign stage
  # table via its external id.
  class MapRelations < Step

    # Hash of foreign key column name => foreign table name.
    attr_reader :relations

    def initialize(table_name, relations)
      super(table_name)
      @relations = relations
    end

    # The AssignIds step of every referenced table plus this table's own
    # Transform step, as a Set of step names.
    def dependencies
      id_steps = relations.values.map { |foreign_table| AssignIds.step_name(foreign_table) }
      Set.new(id_steps).add(Transform.step_name(table_name))
    end

    # Issues one UPDATE per relation.
    def run
      relations.each do |foreign_key_column, foreign_table_name|
        map_relation(foreign_key_column, foreign_table_name)
      end
    end

    private

    # Sets +foreign_key_column+ on this table's stage rows (alias ST) to the id
    # of the foreign stage row (alias FT) of the same import run whose
    # external_id equals ST.external_<foreign_key_column>.
    def map_relation(foreign_key_column, foreign_table_name)
      own_stage = :"#{stage_schema}__#{table_name}___ST"
      foreign_stage = :"#{stage_schema}__#{foreign_table_name}___FT"

      matching_rows = database.from(own_stage, foreign_stage).where(
        ST__import_run_id: run_id,
        FT__import_run_id: run_id,
        FT__external_id: :"ST__external_#{foreign_key_column}"
      )

      matching_rows.update(:"#{foreign_key_column}" => :"FT__id")
    end

  end
end
@@ -0,0 +1,42 @@
module BeetleETL

  # Raised by Step#dependencies when a subclass does not override it.
  DependenciesNotDefinedError = Class.new(StandardError)

  # Base class for import steps that operate on one table. Provides the step
  # naming scheme and shortcuts to the BeetleETL run state, configuration and
  # database handle.
  class Step

    attr_reader :table_name

    def initialize(table_name)
      @table_name = table_name
    end

    # Name of this step class for +table_name+, formatted as
    # "<table_name>: <class name without namespace>".
    def self.step_name(table_name)
      "#{table_name}: #{name.split('::').last}"
    end

    # Step name of this instance (see .step_name).
    def name
      self.class.step_name(table_name)
    end

    # Subclasses must override this method.
    #
    # @raise [DependenciesNotDefinedError] always; the message names the class
    #   that is missing the override
    def dependencies
      raise DependenciesNotDefinedError,
            "#{self.class.name} must implement #dependencies"
    end

    def run_id
      BeetleETL.state.run_id
    end

    def stage_schema
      BeetleETL.config.stage_schema
    end

    def external_source
      BeetleETL.config.external_source
    end

    def database
      BeetleETL.database
    end

  end
end
@@ -0,0 +1,155 @@
module BeetleETL
  # Compares the stage rows of the current import run with the public table
  # and records the outcome in the stage table's +transition+ column
  # (CREATE, KEEP, UPDATE, UNDELETE); public rows missing from the stage
  # table are added to it as DELETE rows.
  class TableDiff < Step

    # Stage-table bookkeeping columns that are excluded from row comparison.
    IMPORTER_COLUMNS = %i[
      import_run_id
      external_id
      transition
    ].freeze

    # Depends on this table's MapRelations step.
    def dependencies
      [MapRelations.step_name(table_name)].to_set
    end

    # Runs every transition in the order create, keep, update, delete,
    # undelete.
    def run
      %w(create keep update delete undelete).each do |transition|
        public_send(:"transition_#{transition}")
      end
    end

    # Marks stage rows that have no public row with the same external id and
    # external source.
    def transition_create
      stage_rows_of_run
        .where(Sequel.~(matching_public_rows.exists))
        .update(transition: 'CREATE')
    end

    # Marks stage rows whose live public counterpart has identical data
    # columns.
    def transition_keep
      mark_compared('KEEP', ':public_columns IS NOT DISTINCT FROM :stage_columns')
    end

    # Marks stage rows whose live public counterpart differs in at least one
    # data column.
    def transition_update
      mark_compared('UPDATE', ':public_columns IS DISTINCT FROM :stage_columns')
    end

    # Inserts a 'DELETE' stage row for every live public row of the configured
    # external source that has no stage row with the same external id.
    #
    # The external_source condition is part of WHERE, not of the join's ON
    # clause: in a RIGHT JOIN an ON condition never removes rows of the right
    # (public) table, so filtering there would stage rows of other external
    # sources for deletion.
    #
    # NOTE(review): the join is not restricted to stage rows of the current
    # import run -- confirm that the stage table only holds rows of one run.
    def transition_delete
      deleted_dataset = database.from(
        :"#{stage_schema}__#{table_name}___stage"
      ).right_join(
        :"#{table_name}___public",
        public__external_id: :stage__external_id
      ).where(
        stage__external_id: nil,
        public__external_source: external_source,
        public__deleted_at: nil
      )

      database[:"#{stage_schema}__#{table_name}"].import(
        [:import_run_id, :external_id, :transition],
        deleted_dataset.select(run_id, :public__external_id, 'DELETE')
      )
    end

    # Marks stage rows whose public counterpart is currently soft-deleted.
    def transition_undelete
      deleted_counterpart = matching_public_rows.exclude(public__deleted_at: nil)

      stage_rows_of_run
        .where(deleted_counterpart.exists)
        .update(transition: 'UNDELETE')
    end

    private

    # Stage rows that belong to the current import run.
    def stage_rows_of_run
      stage_table.where(stage__import_run_id: run_id)
    end

    # Correlated dataset of public rows sharing external id and external
    # source with the outer stage row.
    def matching_public_rows
      public_table.where(
        public__external_id: :stage__external_id,
        public__external_source: external_source
      )
    end

    # Sets +transition+ on stage rows for which a live (not soft-deleted)
    # public counterpart exists that satisfies the row +comparison+ between
    # the public and the stage data columns.
    def mark_compared(transition, comparison)
      counterpart = matching_public_rows
        .where(public__deleted_at: nil)
        .where(
          comparison,
          public_columns: public_record_columns,
          stage_columns: stage_record_columns
        )

      stage_rows_of_run
        .where(counterpart.exists)
        .update(transition: transition)
    end

    def stage_table
      @stage_table ||= database[:"#{stage_schema}__#{table_name}___stage"]
    end

    def public_table
      @public_table ||= database[:"#{table_name}___public"]
    end

    def public_record_columns
      prefixed_columns(data_columns, 'public')
    end

    def stage_record_columns
      prefixed_columns(data_columns, 'stage')
    end

    # Stage columns that take part in the keep/update comparison.
    def data_columns
      table_columns - ignored_columns
    end

    # Column names of the stage table, looked up once per step instance.
    def table_columns
      @table_columns ||= database[:"#{stage_schema}__#{table_name}"].columns
    end

    # Importer bookkeeping columns, the id column and every
    # external_<name>_id column.
    def ignored_columns
      external_id_columns = table_columns.select do |column_name|
        column_name.to_s.index(/^external_.+_id$/)
      end

      importer_columns + [:id] + external_id_columns
    end

    def importer_columns
      IMPORTER_COLUMNS
    end

    # Qualifies every column with +prefix+ ("<prefix>__<column>").
    def prefixed_columns(columns, prefix)
      columns.map { |column| "#{prefix}__#{column}".to_sym }
    end

  end
end
@@ -0,0 +1,22 @@
module BeetleETL
  # Step that executes a given query against the database for one table.
  class Transform < Step

    # The query handed to the database when the step runs.
    attr_reader :query

    # +dependencies+ lists the table names whose Transform steps must have
    # run before this one.
    def initialize(table_name, dependencies, query)
      super(table_name)
      @query = query
      @dependencies = dependencies
    end

    # Set of the Transform step names of all tables this step depends on.
    def dependencies
      step_names = @dependencies.map do |dependency_table|
        self.class.step_name(dependency_table)
      end

      Set.new(step_names)
    end

    # Passes the query to the database.
    def run
      database.run(query)
    end

  end
end
22
+
@@ -0,0 +1,39 @@
module BeetleETL

  # Raised when the dependencies of the given items can never all be
  # satisfied (for example cyclic or unknown dependencies).
  UnsatisfiableDependenciesError = Class.new(StandardError)

  # Decides which items can run next, given the names of the items that have
  # already been resolved. Every item must respond to +name+ and to
  # +dependencies+, the latter returning a Set of item names.
  class DependencyResolver

    # @param items [Array] the items to resolve
    # @raise [UnsatisfiableDependenciesError] if some item can never be
    #   resolved
    def initialize(items)
      @items = items
      check
    end

    # Items that are not resolved yet and whose dependencies are all
    # contained in +resolved+.
    #
    # @param resolved [Enumerable] names of the items resolved so far
    # @return [Array] resolvable items, in their original order
    def resolvables(resolved)
      # Convert once instead of once per item.
      resolved_names = resolved.to_set

      @items.select do |item|
        # An empty dependency set is a subset of every set, so it needs no
        # special case.
        !resolved_names.include?(item.name) &&
          item.dependencies.subset?(resolved_names)
      end
    end

    private

    # Resolves all items round by round and fails as soon as a round makes
    # no progress while items are still pending.
    def check
      pending = @items.dup
      resolved_names = Set.new

      until pending.empty?
        resolvable, pending = pending.partition do |item|
          item.dependencies.subset?(resolved_names)
        end

        if resolvable.empty?
          raise UnsatisfiableDependenciesError,
                "unsatisfiable dependencies for: #{pending.map(&:name).join(', ')}"
        end

        resolvable.each { |item| resolved_names << item.name }
      end
    end

  end
end
@@ -0,0 +1,64 @@
1
+ require 'celluloid/autostart'
2
+
module BeetleETL
  # Celluloid actor that runs a collection of runnables, starting each one
  # through Celluloid's +async+ proxy once the DependencyResolver reports it
  # as resolvable, and terminating itself when every runnable has completed.
  # Each runnable must respond to +name+, +dependencies+ and +run+.
  class TaskRunner

    include Celluloid

    # Validates the dependencies (DependencyResolver.new) and starts the
    # first batch of runnables.
    def initialize(runnables)
      @runnables = runnables
      @dependency_resolver = DependencyResolver.new(runnables)
      @running = Set.new
      @completed = Set.new

      run_next
    end

    # Callback invoked by a Task once its runnable has finished: records the
    # completion and schedules whatever became resolvable.
    def completed(runnable_name)
      @running.delete(runnable_name)
      @completed << runnable_name

      run_next
    end

    # Terminates the actor when everything has completed; otherwise starts
    # every resolvable runnable that is not already running.
    def run_next
      return terminate if all_run?

      resolvables.each do |runnable|
        next if @running.include?(runnable.name)

        start(runnable)
      end
    end

    private

    # Wraps the runnable in a Task actor, starts it and remembers it as
    # running.
    def start(runnable)
      Task.new(Actor.current, runnable).async.run_task
      @running << runnable.name
    end

    # Runnables whose dependencies are all among the completed ones.
    def resolvables
      @dependency_resolver.resolvables(@completed)
    end

    # True when the completed names equal the names of all runnables.
    def all_run?
      @runnables.map(&:name).to_set == @completed
    end

    # Actor that runs one runnable and reports back to the runner.
    class Task

      include Celluloid

      def initialize(runner, task)
        @runner = runner
        @task = task
      end

      # Runs the task, notifies the runner and terminates this actor.
      def run_task
        @task.run
        @runner.async.completed(@task.name)
        terminate
      end
    end

  end
end