beetle_etl 0.0.2 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. checksums.yaml +4 -4
  2. data/Gemfile +6 -0
  3. data/beetle_etl.gemspec +5 -6
  4. data/lib/beetle_etl.rb +7 -12
  5. data/lib/beetle_etl/dsl/dsl.rb +11 -12
  6. data/lib/beetle_etl/dsl/transformation.rb +10 -3
  7. data/lib/beetle_etl/dsl/transformation_loader.rb +13 -5
  8. data/lib/beetle_etl/import.rb +11 -4
  9. data/lib/beetle_etl/naming.rb +37 -0
  10. data/lib/beetle_etl/steps/assign_ids.rb +14 -38
  11. data/lib/beetle_etl/steps/create_stage.rb +59 -0
  12. data/lib/beetle_etl/steps/drop_stage.rb +15 -0
  13. data/lib/beetle_etl/steps/load.rb +46 -61
  14. data/lib/beetle_etl/steps/map_relations.rb +8 -14
  15. data/lib/beetle_etl/steps/step.rb +1 -8
  16. data/lib/beetle_etl/steps/table_diff.rb +68 -89
  17. data/lib/beetle_etl/steps/transform.rb +2 -4
  18. data/lib/beetle_etl/version.rb +1 -1
  19. data/spec/beetle_etl_spec.rb +3 -25
  20. data/spec/dsl/dsl_spec.rb +8 -15
  21. data/spec/dsl/transformation_loader_spec.rb +11 -4
  22. data/spec/dsl/transformation_spec.rb +40 -4
  23. data/spec/feature/example_schema.rb +2 -137
  24. data/spec/feature/example_transform.rb +13 -6
  25. data/spec/feature/feature_spec.rb +119 -18
  26. data/spec/steps/assign_ids_spec.rb +23 -28
  27. data/spec/steps/create_stage_spec.rb +89 -0
  28. data/spec/steps/load_spec.rb +15 -23
  29. data/spec/steps/map_relations_spec.rb +32 -36
  30. data/spec/steps/table_diff_spec.rb +41 -45
  31. data/spec/steps/transform_spec.rb +2 -0
  32. data/spec/{dependency_resolver_spec.rb → task_runner/dependency_resolver_spec.rb} +0 -0
  33. metadata +22 -36
  34. data/lib/beetle_etl/state.rb +0 -67
  35. data/spec/import_spec.rb +0 -7
  36. data/spec/state_spec.rb +0 -124
@@ -1,29 +1,23 @@
1
1
  module BeetleETL
2
2
  class MapRelations < Step
3
3
 
4
- attr_reader :relations
5
-
6
4
  def initialize(table_name, relations)
7
5
  super(table_name)
8
6
  @relations = relations
9
7
  end
10
8
 
11
9
  def dependencies
12
- relations.values.map { |d| AssignIds.step_name(d) }.to_set << Transform.step_name(table_name)
10
+ @relations.values.map { |d| AssignIds.step_name(d) }.to_set << Transform.step_name(table_name)
13
11
  end
14
12
 
15
13
  def run
16
- relations.each do |foreign_key_column, foreign_table_name|
17
- database.from(
18
- :"#{stage_schema}__#{table_name}___ST",
19
- :"#{stage_schema}__#{foreign_table_name}___FT"
20
- ).where(
21
- ST__import_run_id: run_id,
22
- FT__import_run_id: run_id,
23
- FT__external_id: :"ST__external_#{foreign_key_column}",
24
- ).update(
25
- :"#{foreign_key_column}" => :"FT__id"
26
- )
14
+ @relations.map do |foreign_key_column, foreign_table_name|
15
+ database.execute <<-SQL
16
+ UPDATE #{stage_table_name_sql} current_table
17
+ SET #{foreign_key_column} = foreign_table.id
18
+ FROM #{stage_table_name_sql(foreign_table_name)} foreign_table
19
+ WHERE current_table.external_#{foreign_key_column} = foreign_table.external_id
20
+ SQL
27
21
  end
28
22
  end
29
23
 
@@ -4,6 +4,7 @@ module BeetleETL
4
4
 
5
5
  class Step
6
6
 
7
+ include BeetleETL::Naming
7
8
  attr_reader :table_name
8
9
 
9
10
  def initialize(table_name)
@@ -22,14 +23,6 @@ module BeetleETL
22
23
  raise DependenciesNotDefinedError
23
24
  end
24
25
 
25
- def run_id
26
- BeetleETL.state.run_id
27
- end
28
-
29
- def stage_schema
30
- BeetleETL.config.stage_schema
31
- end
32
-
33
26
  def external_source
34
27
  BeetleETL.config.external_source
35
28
  end
@@ -2,7 +2,6 @@ module BeetleETL
2
2
  class TableDiff < Step
3
3
 
4
4
  IMPORTER_COLUMNS = %i[
5
- import_run_id
6
5
  external_id
7
6
  transition
8
7
  ]
@@ -12,115 +11,95 @@ module BeetleETL
12
11
  end
13
12
 
14
13
  def run
15
- %w(create keep update delete undelete).map do |transition|
16
- Thread.new { public_send(:"transition_#{transition}") }
17
- end.each(&:join)
14
+ %w(create keep update delete undelete).each do |transition|
15
+ public_send(:"transition_#{transition}")
16
+ end
18
17
  end
19
18
 
20
19
  def transition_create
21
- stage_table.where(
22
- stage__import_run_id: run_id,
23
- )
24
- .where(Sequel.~(public_table.where(
25
- public__external_id: :stage__external_id,
26
- public__external_source: external_source,
20
+ database.execute <<-SQL
21
+ UPDATE #{stage_table_name_sql} stage
22
+ SET transition = 'CREATE'
23
+ WHERE NOT EXISTS (
24
+ SELECT 1
25
+ FROM #{public_table_name} public
26
+ WHERE public.external_id = stage.external_id
27
+ AND public.external_source = '#{external_source}'
27
28
  )
28
- .exists))
29
- .update(transition: 'CREATE')
29
+ SQL
30
30
  end
31
31
 
32
32
  def transition_keep
33
- stage_table.where(
34
- stage__import_run_id: run_id,
35
- )
36
- .where(
37
- public_table.where(
38
- public__external_id: :stage__external_id,
39
- public__external_source: external_source,
40
- public__deleted_at: nil,
41
- )
42
- .where(
43
- ':public_columns IS NOT DISTINCT FROM :stage_columns',
44
- public_columns: public_record_columns,
45
- stage_columns: stage_record_columns,
33
+ database.execute <<-SQL
34
+ UPDATE #{stage_table_name_sql} stage
35
+ SET transition = 'KEEP'
36
+ WHERE EXISTS (
37
+ SELECT 1
38
+ FROM #{public_table_name} public
39
+ WHERE public.external_id = stage.external_id
40
+ AND public.external_source = '#{external_source}'
41
+ AND public.deleted_at IS NULL
42
+ AND
43
+ (#{public_record_columns.join(', ')})
44
+ IS NOT DISTINCT FROM
45
+ (#{stage_record_columns.join(', ')})
46
46
  )
47
- .exists)
48
- .update(transition: 'KEEP')
47
+ SQL
49
48
  end
50
49
 
51
50
  def transition_update
52
- stage_table.where(
53
- stage__import_run_id: run_id,
54
- )
55
- .where(
56
- public_table.where(
57
- public__external_id: :stage__external_id,
58
- public__external_source: external_source,
59
- public__deleted_at: nil,
60
- )
61
- .where(
62
- ':public_columns IS DISTINCT FROM :stage_columns',
63
- public_columns: public_record_columns,
64
- stage_columns: stage_record_columns,
51
+ database.execute <<-SQL
52
+ UPDATE #{stage_table_name_sql} stage
53
+ SET transition = 'UPDATE'
54
+ WHERE EXISTS (
55
+ SELECT 1
56
+ FROM #{public_table_name} public
57
+ WHERE public.external_id = stage.external_id
58
+ AND public.external_source = '#{external_source}'
59
+ AND public.deleted_at IS NULL
60
+ AND
61
+ (#{public_record_columns.join(', ')})
62
+ IS DISTINCT FROM
63
+ (#{stage_record_columns.join(', ')})
65
64
  )
66
- .exists)
67
- .update(transition: 'UPDATE')
65
+ SQL
68
66
  end
69
67
 
70
68
  def transition_delete
71
- deleted_dataset = database.from(
72
- :"#{stage_schema}__#{table_name}___stage",
73
- ).right_join(
74
- :"#{table_name}___public",
75
- public__external_id: :stage__external_id,
76
- public__external_source: external_source,
77
- ).where(
78
- stage__external_id: nil,
79
- public__deleted_at: nil
80
- )
81
-
82
- database[:"#{stage_schema}__#{table_name}"]
83
- .import(
84
- [
85
- :import_run_id,
86
- :external_id,
87
- :transition
88
- ],
89
- deleted_dataset
90
- .select(
91
- run_id,
92
- :public__external_id,
93
- 'DELETE'
94
- )
95
- )
69
+ database.execute <<-SQL
70
+ INSERT INTO #{stage_table_name_sql}
71
+ (external_id, transition)
72
+ SELECT
73
+ public.external_id,
74
+ 'DELETE'
75
+ FROM #{public_table_name_sql} public
76
+ LEFT OUTER JOIN (
77
+ SELECT *
78
+ FROM #{stage_table_name_sql}
79
+ ) stage
80
+ ON (stage.external_id = public.external_id)
81
+ WHERE stage.external_id IS NULL
82
+ AND public.external_source = '#{external_source}'
83
+ AND public.deleted_at IS NULL
84
+ SQL
96
85
  end
97
86
 
98
87
  def transition_undelete
99
- stage_table.where(
100
- stage__import_run_id: run_id,
101
- )
102
- .where(
103
- public_table.where(
104
- public__external_id: :stage__external_id,
105
- public__external_source: external_source,
106
- )
107
- .exclude(
108
- public__deleted_at: nil
88
+ database.execute <<-SQL
89
+ UPDATE #{stage_table_name_sql} stage
90
+ SET transition = 'UNDELETE'
91
+ WHERE EXISTS (
92
+ SELECT 1
93
+ FROM #{public_table_name_sql} public
94
+ WHERE public.external_id = stage.external_id
95
+ AND public.external_source = '#{external_source}'
96
+ AND public.deleted_at IS NOT NULL
109
97
  )
110
- .exists)
111
- .update(transition: 'UNDELETE')
98
+ SQL
112
99
  end
113
100
 
114
101
  private
115
102
 
116
- def stage_table
117
- @stage_table ||= database[:"#{stage_schema}__#{table_name}___stage"]
118
- end
119
-
120
- def public_table
121
- @public_table ||= database[:"#{table_name}___public"]
122
- end
123
-
124
103
  def public_record_columns
125
104
  prefixed_columns(data_columns, 'public')
126
105
  end
@@ -134,7 +113,7 @@ module BeetleETL
134
113
  end
135
114
 
136
115
  def table_columns
137
- @table_columns ||= database[:"#{stage_schema}__#{table_name}"].columns
116
+ @table_columns ||= database[stage_table_name.to_sym].columns
138
117
  end
139
118
 
140
119
  def ignored_columns
@@ -148,7 +127,7 @@ module BeetleETL
148
127
  end
149
128
 
150
129
  def prefixed_columns(columns, prefix)
151
- columns.map { |column| "#{prefix}__#{column}".to_sym }
130
+ columns.map { |column| %Q("#{prefix}"."#{column}") }
152
131
  end
153
132
 
154
133
  end
@@ -1,8 +1,6 @@
1
1
  module BeetleETL
2
2
  class Transform < Step
3
3
 
4
- attr_reader :query
5
-
6
4
  def initialize(table_name, dependencies, query)
7
5
  super(table_name)
8
6
  @dependencies = dependencies
@@ -10,11 +8,11 @@ module BeetleETL
10
8
  end
11
9
 
12
10
  def dependencies
13
- Set.new(@dependencies.map { |d| self.class.step_name(d) })
11
+ Set.new(@dependencies.map { |d| self.class.step_name(d) }) << CreateStage.step_name(table_name)
14
12
  end
15
13
 
16
14
  def run
17
- database.run(query)
15
+ database.run(@query)
18
16
  end
19
17
 
20
18
  end
@@ -1,3 +1,3 @@
1
1
  module BeetleETL
2
- VERSION = "0.0.2"
2
+ VERSION = "0.0.7"
3
3
  end
@@ -1,34 +1,12 @@
1
1
  require 'spec_helper'
2
2
 
3
3
  describe BeetleETL do
4
- describe '#import' do
5
4
 
5
+ describe '#import' do
6
6
  it 'runs the import' do
7
- allow(BeetleETL).to receive(:state) { double(:state).as_null_object }
8
- expect(BeetleETL::Import).to receive(:run)
7
+ expect(BeetleETL::Import).to receive_message_chain(:new, :run)
9
8
  BeetleETL.import
10
9
  end
11
-
12
- context 'handling state' do
13
- it 'starts the import and marks it as finished if no errors are thrown' do
14
- allow(BeetleETL::Import).to receive(:run)
15
-
16
- expect(BeetleETL.state).to receive(:start_import).ordered
17
- expect(BeetleETL.state).to receive(:mark_as_succeeded).ordered
18
-
19
- BeetleETL.import
20
- end
21
-
22
- it 'starts the import and marks it as failed if Import.run throws an error' do
23
- exception = Exception.new
24
- allow(BeetleETL::Import).to receive(:run).and_raise(exception)
25
-
26
- expect(BeetleETL.state).to receive(:start_import).ordered
27
- expect(BeetleETL.state).to receive(:mark_as_failed).ordered
28
-
29
- expect { BeetleETL.import }.to raise_exception(exception)
30
- end
31
- end
32
10
  end
33
11
 
34
12
  describe '#config' do
@@ -65,6 +43,6 @@ describe BeetleETL do
65
43
  expect(BeetleETL.database).to eql(database)
66
44
  expect(BeetleETL.database).to eql(database)
67
45
  end
68
-
69
46
  end
47
+
70
48
  end
@@ -6,15 +6,16 @@ module BeetleETL
6
6
  subject { DSL.new(:foo_table) }
7
7
 
8
8
  describe '#stage_table' do
9
- it 'returns the stage table name including the schema defined in the config' do
10
- BeetleETL.configure { |config| config.stage_schema = 'bar' }
11
- expect(subject.stage_table).to eql('"bar"."foo_table"')
9
+ it 'returns the current stage table name' do
10
+ expect(subject.stage_table).to eql(
11
+ BeetleETL::Naming.stage_table_name_sql(:foo_table)
12
+ )
12
13
  end
13
- end
14
14
 
15
- describe '#external_source' do
16
- it 'returns the external source‘s identifier' do
17
- expect(subject.external_source).to eql('source')
15
+ it 'returns the stage table name for the given table' do
16
+ expect(subject.stage_table(:bar_table)).to eql(
17
+ BeetleETL::Naming.stage_table_name_sql(:bar_table)
18
+ )
18
19
  end
19
20
  end
20
21
 
@@ -32,13 +33,5 @@ module BeetleETL
32
33
  end
33
34
  end
34
35
 
35
- describe '#import_run_id' do
36
- it 'returns the import run id defined in the config' do
37
- id = double(:id)
38
- allow(BeetleETL.state).to receive(:run_id) { id }
39
- expect(subject.import_run_id).to eql(id)
40
- end
41
- end
42
-
43
36
  end
44
37
  end
@@ -4,6 +4,8 @@ require 'tempfile'
4
4
  module BeetleETL
5
5
  describe TransformationLoader do
6
6
 
7
+ subject { TransformationLoader.new }
8
+
7
9
  before :example do
8
10
  data_file = tempfile_with_contents <<-FILE
9
11
  import :foo do
@@ -13,6 +15,10 @@ module BeetleETL
13
15
  import :bar do
14
16
  'bar'
15
17
  end
18
+
19
+ helpers do
20
+ "baz"
21
+ end
16
22
  FILE
17
23
 
18
24
  BeetleETL.configure do |config|
@@ -21,16 +27,17 @@ module BeetleETL
21
27
  end
22
28
 
23
29
  describe '#load' do
24
- it 'loads runlist entries from the data file' do
25
- expect(Transformation).to receive(:new) do |table_name, config|
30
+ it 'loads transformations from the data file' do
31
+ expect(Transformation).to receive(:new) do |table_name, config, helpers|
26
32
  expect(table_name.to_s).to eql(config.call)
33
+ expect(helpers.call).to eql("baz")
27
34
  end.exactly(2).times
28
35
 
29
36
  subject.load
30
37
  end
31
38
 
32
- it 'adds every runlist entry to the entries array' do
33
- allow(Transformation).to receive(:new) do |table_name, config|
39
+ it 'returns the list of transformations' do
40
+ allow(Transformation).to receive(:new) do |table_name, config, helpers|
34
41
  table_name
35
42
  end
36
43
 
@@ -10,15 +10,36 @@ module BeetleETL
10
10
  end
11
11
  end
12
12
 
13
+ describe 'columns' do
14
+ it 'returns a list of payload column name symbols' do
15
+ setup = Proc.new do
16
+ columns :payload_1, 'payload_2'
17
+ end
18
+ transformation = Transformation.new(:table, setup)
19
+
20
+ expect(transformation.column_names).to match_array([
21
+ :payload_1, :payload_2
22
+ ])
23
+ end
24
+
25
+ it 'defaults to an empty array if no columns are defined' do
26
+ transformation = Transformation.new(:table, Proc.new {})
27
+
28
+ expect(transformation.column_names).to match_array([])
29
+ end
30
+ end
31
+
13
32
  describe '#relations' do
14
33
  it 'returns the list of foreign tables and their foreign key column' do
15
34
  setup = Proc.new do
16
35
  references :foreign_table, on: :foreign_table_id
36
+ references :another_foreign_table, on: :another_foreign_table_id
17
37
  end
18
38
  transformation = Transformation.new(:table, setup)
19
39
 
20
40
  expect(transformation.relations).to eql({
21
- foreign_table_id: :foreign_table
41
+ foreign_table_id: :foreign_table,
42
+ another_foreign_table_id: :another_foreign_table
22
43
  })
23
44
  end
24
45
  end
@@ -36,18 +57,33 @@ module BeetleETL
36
57
  end
37
58
 
38
59
  describe '#query' do
39
- it 'returns the query interpolating methods in scope' do
60
+ it 'returns the query interpolating methods defined as helpers' do
61
+ helpers = Proc.new do
62
+ def foo; "foo_string"; end
63
+ end
40
64
 
41
65
  setup = Proc.new do
42
- def foo; "foo_string"; end
43
66
  query "SELECT '#{foo}' FROM some_table"
44
67
  end
45
- transformation = Transformation.new(:table, setup)
68
+
69
+ transformation = Transformation.new(:table, setup, helpers)
46
70
 
47
71
  expect(transformation.query).to eql(
48
72
  "SELECT 'foo_string' FROM some_table"
49
73
  )
50
74
  end
75
+
76
+ it 'concatenates multiple queries' do
77
+ setup = Proc.new do
78
+ query "SOME QUERY"
79
+ query "ANOTHER QUERY"
80
+ end
81
+ transformation = Transformation.new(:table, setup)
82
+
83
+ expect(transformation.query).to eql(
84
+ "SOME QUERY;ANOTHER QUERY"
85
+ )
86
+ end
51
87
  end
52
88
 
53
89
  end