beetle_etl 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +23 -0
  3. data/.travis.yml +12 -0
  4. data/Gemfile +4 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +31 -0
  7. data/Rakefile +5 -0
  8. data/beetle_etl.gemspec +30 -0
  9. data/lib/beetle_etl.rb +85 -0
  10. data/lib/beetle_etl/dsl/dsl.rb +37 -0
  11. data/lib/beetle_etl/dsl/transformation.rb +26 -0
  12. data/lib/beetle_etl/dsl/transformation_loader.rb +22 -0
  13. data/lib/beetle_etl/import.rb +37 -0
  14. data/lib/beetle_etl/state.rb +67 -0
  15. data/lib/beetle_etl/steps/assign_ids.rb +54 -0
  16. data/lib/beetle_etl/steps/load.rb +108 -0
  17. data/lib/beetle_etl/steps/map_relations.rb +31 -0
  18. data/lib/beetle_etl/steps/step.rb +42 -0
  19. data/lib/beetle_etl/steps/table_diff.rb +155 -0
  20. data/lib/beetle_etl/steps/transform.rb +22 -0
  21. data/lib/beetle_etl/task_runner/dependency_resolver.rb +39 -0
  22. data/lib/beetle_etl/task_runner/task_runner.rb +64 -0
  23. data/lib/beetle_etl/version.rb +3 -0
  24. data/script/postgres +12 -0
  25. data/spec/beetle_etl_spec.rb +70 -0
  26. data/spec/dependency_resolver_spec.rb +57 -0
  27. data/spec/dsl/dsl_spec.rb +44 -0
  28. data/spec/dsl/transformation_loader_spec.rb +51 -0
  29. data/spec/dsl/transformation_spec.rb +54 -0
  30. data/spec/feature/example_schema.rb +192 -0
  31. data/spec/feature/example_transform.rb +37 -0
  32. data/spec/feature/feature_spec.rb +48 -0
  33. data/spec/import_spec.rb +7 -0
  34. data/spec/spec_helper.rb +25 -0
  35. data/spec/state_spec.rb +124 -0
  36. data/spec/steps/assign_ids_spec.rb +107 -0
  37. data/spec/steps/load_spec.rb +148 -0
  38. data/spec/steps/map_relations_spec.rb +92 -0
  39. data/spec/steps/step_spec.rb +37 -0
  40. data/spec/steps/table_diff_spec.rb +183 -0
  41. data/spec/steps/transform_spec.rb +34 -0
  42. data/spec/support/database.yml.example +9 -0
  43. data/spec/support/database.yml.travis +4 -0
  44. data/spec/support/database_helpers.rb +58 -0
  45. metadata +220 -0
@@ -0,0 +1,37 @@
1
+ import :organisations do # declares the :organisations import; the query below inserts into the table named by stage_table
2
+ query <<-SQL # one staged row per distinct source "Name"; that value is used for both external_id and name
3
+ INSERT INTO #{stage_table} (
4
+ external_id,
5
+ import_run_id,
6
+ name
7
+ )
8
+
9
+ SELECT DISTINCT
10
+ o."Name",
11
+ #{import_run_id},
12
+ o."Name"
13
+
14
+ FROM source."Organisation" o
15
+ SQL
16
+ end
17
+
18
+ import :departments do # declares the :departments import; the query below inserts into the table named by stage_table
19
+ references :organisations, on: :organisation_id # relation to :organisations via organisation_id; presumably resolved from external_organisation_id by a later step — confirm
20
+
21
+ query <<-SQL # one staged row per source row: external_id combines "Name" and "pkOrgId", name is "Abteilung", external_organisation_id is "Name"
22
+ INSERT INTO #{stage_table} (
23
+ external_id,
24
+ import_run_id,
25
+ name,
26
+ external_organisation_id
27
+ )
28
+
29
+ SELECT
30
+ #{combined_key('o."Name"', 'o."pkOrgId"')},
31
+ #{import_run_id},
32
+ o."Abteilung",
33
+ o."Name"
34
+
35
+ FROM source."Organisation" o
36
+ SQL
37
+ end
@@ -0,0 +1,48 @@
1
+ require 'spec_helper'
2
+ require_relative 'example_schema'
3
+ require 'yaml'
4
+
5
+ require 'active_support/core_ext/date/calculations'
6
+ require 'active_support/core_ext/numeric/time'
7
+
8
+ describe BeetleETL do
9
+
10
+ include ExampleSchema
11
+
12
+ let!(:now) { Time.new(2014, 07, 17, 16, 12).beginning_of_day }
13
+ before { allow(Time).to receive(:now) { now } }
14
+
15
+ before { create_tables }
16
+ after { drop_tables }
17
+
18
+ it 'is a working', :feature do
19
+ insert_into(:source__Organisation).values(
20
+ [ :pkOrgId , :Name , :Abteilung ] ,
21
+ [ 1 , 'Apple' , 'iPhone' ] ,
22
+ [ 2 , 'Apple' , 'MacBook' ] ,
23
+ )
24
+
25
+ BeetleETL.configure do |config|
26
+ config.transformation_file = File.expand_path('../example_transform.rb', __FILE__)
27
+ config.database = test_database
28
+ config.external_source = 'source_name'
29
+ config.stage_schema = 'stage'
30
+ end
31
+
32
+
33
+ BeetleETL.import
34
+
35
+
36
+ expect(:organisations).to have_values(
37
+ [ :id , :external_id , :external_source , :name , :created_at , :updated_at , :deleted_at ] ,
38
+ [ 1 , 'Apple' , 'source_name' , 'Apple' , now , now , nil ]
39
+ )
40
+
41
+ expect(:departments).to have_values(
42
+ [ :id , :external_id , :external_source , :name , :organisation_id , :created_at , :updated_at , :deleted_at ] ,
43
+ [ 1 , '[Apple,1]' , 'source_name' , 'iPhone' , 1 , now , now , nil ] ,
44
+ [ 2 , '[Apple,2]' , 'source_name' , 'MacBook' , 1 , now , now , nil ] ,
45
+ )
46
+ end
47
+
48
+ end
@@ -0,0 +1,7 @@
1
+ require 'spec_helper'
2
+
3
+ module BeetleETL
4
+ describe Import do # NOTE(review): placeholder — this group contains no examples yet, so Import is not exercised here
5
+
6
+ end
7
+ end
@@ -0,0 +1,25 @@
1
+ require "codeclimate-test-reporter"
2
+ CodeClimate::TestReporter.start
3
+
4
+ require_relative '../lib/beetle_etl.rb'
5
+ require_relative 'support/database_helpers.rb'
6
+
7
+ RSpec.configure do |config|
8
+
9
+ config.include SpecSupport::DatabaseHelpers
10
+ config.backtrace_exclusion_patterns = [/rspec-core/]
11
+
12
+ config.around(:each) do |example|
13
+ BeetleETL.reset
14
+ if example.metadata[:feature]
15
+ example.run
16
+ else
17
+ test_database.transaction do
18
+ example.run
19
+ raise Sequel::Error::Rollback
20
+ end
21
+ end
22
+ end
23
+
24
+ end
25
+
@@ -0,0 +1,124 @@
1
+ require 'spec_helper'
2
+
3
+ require 'active_support/core_ext/date/calculations'
4
+ require 'active_support/core_ext/numeric/time'
5
+
6
+ module BeetleETL
7
+ describe State do
8
+ subject { State.new }
9
+
10
+ before do # per-example setup: point BeetleETL at the test database and create the import_runs table used by State
11
+ BeetleETL.configure do |config|
12
+ config.stage_schema = 'stage'
13
+ config.database = test_database
14
+ end
15
+
16
+ test_database.create_schema 'stage'
17
+ test_database.create_table :stage__import_runs do # presumably Sequel's schema__table symbol form, i.e. import_runs inside the stage schema — confirm
18
+ primary_key :id
19
+ String :state, size: 10, null: false
20
+ DateTime :started_at, null: false
21
+ DateTime :finished_at
22
+ end
23
+ end
24
+
25
+ describe '#start_import' do
26
+ let(:now) { 1.minute.ago.beginning_of_day }
27
+
28
+ it 'registers a new import in the import_runs table' do
29
+ allow(subject).to receive(:now) { now }
30
+
31
+ subject.start_import
32
+
33
+ expect(:stage__import_runs).to have_values(
34
+ [ :id , :state , :started_at , :finished_at ] ,
35
+ [ 1 , 'RUNNING' , now , nil ]
36
+ )
37
+ end
38
+
39
+ it 'raises an exception if there is alreay an import marked as running' do
40
+ insert_into(:stage__import_runs).values(
41
+ [ :id , :state , :started_at , :finished_at ] ,
42
+ [ 1 , 'RUNNING' , now , nil ]
43
+ )
44
+
45
+ expect { subject.start_import }.to raise_exception(BeetleETL::ImportAleadyRunning)
46
+ end
47
+ end
48
+
49
+ context 'run ids' do
50
+ before do
51
+ insert_into(:stage__import_runs).values(
52
+ [ :state , :started_at , :finished_at ] ,
53
+ [ 'FAILED' , 8.days.ago , 7.days.ago ] ,
54
+ [ 'SUCCEEDED' , 6.days.ago , 5.day.ago ] ,
55
+ [ 'SUCCEEDED' , 4.days.ago , 3.days.ago ] ,
56
+ [ 'FAILED' , 2.days.ago , 1.day.ago ] ,
57
+ )
58
+ end
59
+
60
+ describe '#run_id' do
61
+ it 'returns the import‘s id after it has been started' do
62
+ subject.start_import
63
+ expect(subject.run_id).to eql(5)
64
+ end
65
+
66
+ it 'raises an exception when the import has not been started' do
67
+ expect { subject.run_id }.to raise_exception(BeetleETL::ImportNotRunning)
68
+ end
69
+ end
70
+
71
+ describe '#last_run_id' do
72
+ it 'returns nil if there is no last successful import' do
73
+ test_database[:stage__import_runs].update(state: 'FAILED')
74
+
75
+ subject.start_import
76
+ expect(subject.last_run_id).to be_nil
77
+ end
78
+
79
+ it 'returns the id of the last successul import' do
80
+ subject.start_import
81
+ expect(subject.last_run_id).to eql(3)
82
+ end
83
+ end
84
+ end
85
+
86
+ context 'marking imports' do # covers #mark_as_failed and #mark_as_succeeded on a freshly started run
87
+ let(:now) { 1.minute.ago.beginning_of_day }
88
+ let(:one_day_ago) { 1.day.ago.beginning_of_day }
89
+
90
+ before do # one earlier SUCCEEDED run exists, then a new run is started with a stubbed clock
91
+ insert_into(:stage__import_runs).values(
92
+ [ :state , :started_at , :finished_at ] ,
93
+ [ 'SUCCEEDED' , 2.days.ago , one_day_ago ] ,
94
+ )
95
+ allow(subject).to receive(:now) { now } # stub the subject's clock so finished_at can be compared exactly
96
+ subject.start_import
97
+ end
98
+
99
+ describe '#mark_as_failed' do
100
+ it 'marks the current import as FAILED' do
101
+ subject.mark_as_failed
102
+
103
+ expect(:stage__import_runs).to have_values( # the earlier run is unchanged; only the current run (id 2) is updated
104
+ [ :id , :state , :finished_at ] ,
105
+ [ 1 , 'SUCCEEDED' , one_day_ago ] ,
106
+ [ 2 , 'FAILED' , now ] ,
107
+ )
108
+ end
109
+ end
110
+
111
+ describe '#mark_as_succeeded' do
112
+ it 'marks the current import as SUCCEEDED' do
113
+ subject.mark_as_succeeded
114
+
115
+ expect(:stage__import_runs).to have_values( # the earlier run is unchanged; only the current run (id 2) is updated
116
+ [ :id , :state , :finished_at ] ,
117
+ [ 1 , 'SUCCEEDED' , one_day_ago ] ,
118
+ [ 2 , 'SUCCEEDED' , now ] ,
119
+ )
120
+ end
121
+ end
122
+ end
123
+ end
124
+ end
@@ -0,0 +1,107 @@
1
+ require 'spec_helper'
2
+
3
+ module BeetleETL
4
+ describe AssignIds do
5
+
6
+ let(:run_id) { 1 }
7
+ let(:external_source) { 'my_source' }
8
+ subject { AssignIds.new(:example_table) }
9
+
10
+ before do # per-example setup: configure BeetleETL, stub the run state, create the stage and public example tables
11
+ BeetleETL.configure do |config|
12
+ config.stage_schema = 'stage'
13
+ config.external_source = external_source
14
+ config.database = test_database
15
+ end
16
+
17
+ allow(BeetleETL).to receive(:state) { double(:state, run_id: run_id) } # replace the global state with a double that only answers run_id
18
+
19
+ test_database.create_schema(:stage)
20
+ test_database.create_table(:stage__example_table) do # stage-side table; id is a plain Integer column here, not a primary key
21
+ Integer :id
22
+ Integer :import_run_id
23
+ String :external_id, size: 255
24
+ String :transition, size: 255
25
+ end
26
+
27
+ test_database.create_table(:example_table) do # public-side table with an auto-generated primary key
28
+ primary_key :id
29
+ String :external_id, size: 255
30
+ String :external_source, size: 255
31
+ end
32
+
33
+ end
34
+
35
+ describe '#dependencies' do
36
+ it 'depends on TableDiff of the same table' do
37
+ expect(subject.dependencies).to eql(['example_table: TableDiff'].to_set)
38
+ end
39
+ end
40
+
41
+ describe '#run' do
42
+ it 'runs all transitions' do
43
+ %i(assign_new_ids map_existing_ids).each do |method|
44
+ expect(subject).to receive(method)
45
+ end
46
+
47
+ subject.run
48
+ end
49
+ end
50
+
51
+ describe '#assign_new_ids' do # only staged rows with transition CREATE are expected to receive an id
52
+ it 'generates new ids for newly created records' do
53
+ insert_into(:example_table).values( # one existing public row, so the next generated id is expected to be 2
54
+ [ :external_id , :external_source ] ,
55
+ [ 'keep_id' , external_source ] ,
56
+ )
57
+
58
+ insert_into(:stage__example_table).values(
59
+ [ :import_run_id , :external_id , :transition ] ,
60
+ [ run_id , 'create_id' , 'CREATE' ] ,
61
+ [ run_id , 'keep_id' , 'KEEP' ] ,
62
+ )
63
+
64
+ subject.assign_new_ids
65
+
66
+ expect(:stage__example_table).to have_values( # the KEEP row keeps a nil id; only the CREATE row is filled in
67
+ [ :id , :import_run_id , :external_id , :transition ] ,
68
+ [ 2 , run_id , 'create_id' , 'CREATE' ] ,
69
+ [ nil , run_id , 'keep_id' , 'KEEP' ] ,
70
+ )
71
+ end
72
+ end
73
+
74
+ describe '#map_existing_ids' do # staged rows that already exist in the public table are expected to pick up that table's id
75
+ it 'assigns ids for existing records by their external id' do
76
+ insert_into(:example_table).values( # four existing public rows; the expectation below assumes they get ids 1-4 in insertion order
77
+ [ :external_id , :external_source ] ,
78
+ [ 'keep_id' , external_source ] ,
79
+ [ 'update_id' , external_source ] ,
80
+ [ 'delete_id' , external_source ] ,
81
+ [ 'undelete_id' , external_source ] ,
82
+ )
83
+
84
+ insert_into(:stage__example_table).values(
85
+ [ :import_run_id , :external_id , :transition ] ,
86
+ [ run_id , 'create_id' , 'CREATE' ] ,
87
+ [ run_id , 'keep_id' , 'KEEP' ] ,
88
+ [ run_id , 'update_id' , 'UPDATE' ] ,
89
+ [ run_id , 'delete_id' , 'DELETE' ] ,
90
+ [ run_id , 'undelete_id' , 'UNDELETE' ] ,
91
+ )
92
+
93
+ subject.map_existing_ids
94
+
95
+ expect(:stage__example_table).to have_values( # the CREATE row has no public counterpart and stays nil
96
+ [ :id , :import_run_id , :external_id , :transition ] ,
97
+ [ nil , run_id , 'create_id' , 'CREATE' ] ,
98
+ [ 1 , run_id , 'keep_id' , 'KEEP' ] ,
99
+ [ 2 , run_id , 'update_id' , 'UPDATE' ] ,
100
+ [ 3 , run_id , 'delete_id' , 'DELETE' ] ,
101
+ [ 4 , run_id , 'undelete_id' , 'UNDELETE' ] ,
102
+ )
103
+ end
104
+ end
105
+
106
+ end
107
+ end
@@ -0,0 +1,148 @@
1
+ require 'spec_helper'
2
+
3
+ require 'active_support/core_ext/date/calculations'
4
+ require 'active_support/core_ext/numeric/time'
5
+
6
+ module BeetleETL
7
+ describe Load do
8
+
9
+ let(:run_id) { 1 }
10
+ let(:old_run_id) { 5000 } # id of a different run; rows carrying it are expected to be ignored by the load steps
11
+ let(:external_source) { 'my_source' }
12
+
13
+ let(:now) { Time.now.beginning_of_day }
14
+ let(:yesterday) { 1.day.ago.beginning_of_day }
15
+
16
+ subject { Load.new(:example_table) }
17
+
18
+ before do # per-example setup: configure BeetleETL, stub run state and clock, create the stage and public example tables
19
+ BeetleETL.configure do |config|
20
+ config.stage_schema = 'stage'
21
+ config.external_source = external_source
22
+ config.database = test_database
23
+ end
24
+
25
+ allow(BeetleETL).to receive(:state) { double(:state, run_id: run_id) } # replace the global state with a double that only answers run_id
26
+ allow(subject).to receive(:now) { now } # stub the subject's clock so timestamp columns can be compared exactly
27
+
28
+ test_database.create_schema(:stage)
29
+ test_database.create_table(:stage__example_table) do # stage-side table
30
+ Integer :import_run_id
31
+ Integer :id
32
+ String :external_id, size: 255
33
+ String :transition, size: 20
34
+
35
+ String :external_foo_id, size: 255
36
+ Integer :foo_id
37
+
38
+ String :payload, size: 255
39
+ end
40
+
41
+ test_database.create_table(:example_table) do # public-side table; ignored_attribute has no counterpart in the stage table
42
+ primary_key :id
43
+ String :external_id, size: 255
44
+ String :external_source, size: 255
45
+ DateTime :created_at
46
+ DateTime :updated_at
47
+ DateTime :deleted_at
48
+
49
+ String :payload, size: 255
50
+ String :ignored_attribute, size: 255
51
+ Integer :foo_id
52
+ end
53
+ end
54
+
55
+ describe '#run' do
56
+ it 'runs all load steps' do
57
+ %w(create update delete undelete).each do |transition|
58
+ expect(subject).to receive(:"load_#{transition}")
59
+ end
60
+
61
+ subject.run
62
+ end
63
+ end
64
+
65
+ describe '#load_create' do # staged CREATE rows of the current run are expected to be inserted into the public table
66
+ it 'loads records into the public table' do
67
+ insert_into(:stage__example_table).values( # two staged rows with the same id: one from old_run_id, one from the current run
68
+ [ :id , :import_run_id , :external_id , :transition , :external_foo_id , :foo_id , :payload ] ,
69
+ [ 3 , old_run_id , 'external_id' , 'CREATE' , 'foo_id' , 999 , 'some content' ] ,
70
+ [ 3 , run_id , 'external_id' , 'CREATE' , 'foo_id' , 22 , 'content' ] ,
71
+ )
72
+
73
+ subject.load_create
74
+
75
+ expect(:example_table).to have_values( # only the current run's values appear; created_at and updated_at are both the stubbed now
76
+ [ :id , :external_id , :external_source , :foo_id , :created_at , :updated_at , :deleted_at , :payload ] ,
77
+ [ 3 , 'external_id' , external_source , 22 , now , now , nil , 'content' ] ,
78
+ )
79
+ end
80
+ end
81
+
82
+ describe '#load_update' do # staged UPDATE rows of the current run are expected to overwrite the matching public row
83
+ it 'updates existing records' do
84
+ insert_into(:example_table).values( # existing public row created yesterday
85
+ [ :id , :external_id , :external_source , :foo_id , :created_at , :updated_at , :deleted_at , :payload ] ,
86
+ [ 1 , 'external_id' , external_source , 22 , yesterday , yesterday , nil , 'content' ] ,
87
+ )
88
+
89
+ insert_into(:stage__example_table).values( # the old_run_id row carries different values and is expected to be ignored
90
+ [ :id , :import_run_id , :external_id , :transition , :external_foo_id , :foo_id , :payload ] ,
91
+ [ 1 , old_run_id , 'external_id' , 'UPDATE' , 'foo_id' , 999 , 'some content' ] ,
92
+ [ 1 , run_id , 'external_id' , 'UPDATE' , 'foo_id' , 33 , 'updated content' ] ,
93
+ )
94
+
95
+ subject.load_update
96
+
97
+ expect(:example_table).to have_values( # foo_id and payload change, updated_at becomes now, created_at is preserved
98
+ [ :id , :external_id , :external_source , :foo_id , :created_at , :updated_at , :deleted_at , :payload ] ,
99
+ [ 1 , 'external_id' , external_source , 33 , yesterday , now , nil , 'updated content' ] ,
100
+ )
101
+ end
102
+ end
103
+
104
+ describe '#load_delete' do # staged DELETE rows of the current run are expected to soft-delete the matching public row
105
+ it 'marks existing records as deleted' do
106
+ insert_into(:example_table).values( # existing, not yet deleted public row
107
+ [ :id , :external_id , :external_source , :foo_id , :created_at , :updated_at , :deleted_at , :payload ] ,
108
+ [ 1 , 'external_id' , external_source , 22 , yesterday , yesterday , nil , 'content' ] ,
109
+ )
110
+
111
+ insert_into(:stage__example_table).values( # the old_run_id row is an UPDATE from another run and is expected to be ignored
112
+ [ :id , :import_run_id , :external_id , :transition , :external_foo_id , :foo_id , :payload ] ,
113
+ [ 1 , old_run_id , 'external_id' , 'UPDATE' , 'foo_id' , 999 , 'some content' ] ,
114
+ [ 1 , run_id , 'external_id' , 'DELETE' , 'foo_id' , 33 , 'updated content' ] ,
115
+ )
116
+
117
+ subject.load_delete
118
+
119
+ expect(:example_table).to have_values( # foo_id and payload keep their old values; updated_at and deleted_at both become now
120
+ [ :id , :external_id , :external_source , :foo_id , :created_at , :updated_at , :deleted_at , :payload ] ,
121
+ [ 1 , 'external_id' , external_source , 22 , yesterday , now , now , 'content' ] ,
122
+ )
123
+ end
124
+ end
125
+
126
+ describe '#load_undelete' do
127
+ it 'reinstates deleted records' do
128
+ insert_into(:example_table).values(
129
+ [ :id , :external_id , :external_source , :foo_id , :created_at , :updated_at , :deleted_at , :payload ] ,
130
+ [ 1 , 'external_id' , external_source , 22 , yesterday , yesterday , nil , 'content' ] ,
131
+ )
132
+
133
+ insert_into(:stage__example_table).values(
134
+ [ :id , :import_run_id , :external_id , :transition , :external_foo_id , :foo_id , :payload ] ,
135
+ [ 1 , old_run_id , 'external_id' , 'UPDATE' , 'foo_id' , 999 , 'some content' ] ,
136
+ [ 1 , run_id , 'external_id' , 'UNDELETE' , 'foo_id' , 33 , 'updated content' ] ,
137
+ )
138
+
139
+ subject.load_undelete
140
+
141
+ expect(:example_table).to have_values(
142
+ [ :id , :external_id , :external_source , :foo_id , :created_at , :updated_at , :deleted_at , :payload ] ,
143
+ [ 1 , 'external_id' , external_source , 33 , yesterday , now , nil , 'updated content' ] ,
144
+ )
145
+ end
146
+ end
147
+ end
148
+ end