beetle_etl 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. checksums.yaml +7 -0
  2. data/.gitignore +23 -0
  3. data/.travis.yml +12 -0
  4. data/Gemfile +4 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +31 -0
  7. data/Rakefile +5 -0
  8. data/beetle_etl.gemspec +30 -0
  9. data/lib/beetle_etl.rb +85 -0
  10. data/lib/beetle_etl/dsl/dsl.rb +37 -0
  11. data/lib/beetle_etl/dsl/transformation.rb +26 -0
  12. data/lib/beetle_etl/dsl/transformation_loader.rb +22 -0
  13. data/lib/beetle_etl/import.rb +37 -0
  14. data/lib/beetle_etl/state.rb +67 -0
  15. data/lib/beetle_etl/steps/assign_ids.rb +54 -0
  16. data/lib/beetle_etl/steps/load.rb +108 -0
  17. data/lib/beetle_etl/steps/map_relations.rb +31 -0
  18. data/lib/beetle_etl/steps/step.rb +42 -0
  19. data/lib/beetle_etl/steps/table_diff.rb +155 -0
  20. data/lib/beetle_etl/steps/transform.rb +22 -0
  21. data/lib/beetle_etl/task_runner/dependency_resolver.rb +39 -0
  22. data/lib/beetle_etl/task_runner/task_runner.rb +64 -0
  23. data/lib/beetle_etl/version.rb +3 -0
  24. data/script/postgres +12 -0
  25. data/spec/beetle_etl_spec.rb +70 -0
  26. data/spec/dependency_resolver_spec.rb +57 -0
  27. data/spec/dsl/dsl_spec.rb +44 -0
  28. data/spec/dsl/transformation_loader_spec.rb +51 -0
  29. data/spec/dsl/transformation_spec.rb +54 -0
  30. data/spec/feature/example_schema.rb +192 -0
  31. data/spec/feature/example_transform.rb +37 -0
  32. data/spec/feature/feature_spec.rb +48 -0
  33. data/spec/import_spec.rb +7 -0
  34. data/spec/spec_helper.rb +25 -0
  35. data/spec/state_spec.rb +124 -0
  36. data/spec/steps/assign_ids_spec.rb +107 -0
  37. data/spec/steps/load_spec.rb +148 -0
  38. data/spec/steps/map_relations_spec.rb +92 -0
  39. data/spec/steps/step_spec.rb +37 -0
  40. data/spec/steps/table_diff_spec.rb +183 -0
  41. data/spec/steps/transform_spec.rb +34 -0
  42. data/spec/support/database.yml.example +9 -0
  43. data/spec/support/database.yml.travis +4 -0
  44. data/spec/support/database_helpers.rb +58 -0
  45. metadata +220 -0
data/lib/beetle_etl/version.rb ADDED
@@ -0,0 +1,3 @@
1
+ module BeetleETL
2
+ VERSION = "0.0.1"
3
+ end
data/script/postgres ADDED
@@ -0,0 +1,12 @@
1
+ #!/usr/bin/env bash
2
+
3
+ config_file="/usr/local/var/postgres/postgresql.conf"
4
+
5
+ case "$1" in
6
+ "run" )
7
+ postgres -D /usr/local/var/postgres -c config_file=$config_file
8
+ ;;
9
+ * )
10
+ echo "USAGE: $0 {brew|…} run"
11
+ ;;
12
+ esac
data/spec/beetle_etl_spec.rb ADDED
@@ -0,0 +1,70 @@
1
+ require 'spec_helper'
2
+
3
+ describe BeetleETL do
4
+ describe '#import' do
5
+
6
+ it 'runs the import' do
7
+ allow(BeetleETL).to receive(:state) { double(:state).as_null_object }
8
+ expect(BeetleETL::Import).to receive(:run)
9
+ BeetleETL.import
10
+ end
11
+
12
+ context 'handling state' do
13
+ it 'starts the import and marks it as finished if no errors are thrown' do
14
+ allow(BeetleETL::Import).to receive(:run)
15
+
16
+ expect(BeetleETL.state).to receive(:start_import).ordered
17
+ expect(BeetleETL.state).to receive(:mark_as_succeeded).ordered
18
+
19
+ BeetleETL.import
20
+ end
21
+
22
+ it 'starts the import and marks it as failed if Import.run throws an error' do
23
+ exception = Exception.new
24
+ allow(BeetleETL::Import).to receive(:run).and_raise(exception)
25
+
26
+ expect(BeetleETL.state).to receive(:start_import).ordered
27
+ expect(BeetleETL.state).to receive(:mark_as_failed).ordered
28
+
29
+ expect { BeetleETL.import }.to raise_exception(exception)
30
+ end
31
+ end
32
+ end
33
+
34
+ describe '#config' do
35
+ it 'returns a configuration object' do
36
+ expect(BeetleETL.config).to be_a(BeetleETL::Configuration)
37
+ end
38
+ end
39
+
40
+ describe '#configure' do
41
+ it 'allows the configuration to be changed' do
42
+ expect(BeetleETL.config.external_source).to be_nil
43
+
44
+ BeetleETL.configure { |config| config.external_source = 'foo' }
45
+
46
+ expect(BeetleETL.config.external_source).to eql('foo')
47
+ end
48
+ end
49
+
50
+ describe '#database' do
51
+ let(:database) { double(:database) }
52
+
53
+ it 'returns the Sequel Database object stored in the config' do
54
+ BeetleETL.configure { |config| config.database = database }
55
+
56
+ expect(BeetleETL.database).to eql(database)
57
+ end
58
+
59
+ it 'builds and caches a Sequel Database from config when no database is passed' do
60
+ database_config = double(:database_config)
61
+ BeetleETL.configure { |config| config.database_config = database_config }
62
+
63
+ expect(Sequel).to receive(:connect).with(database_config).once { database }
64
+
65
+ expect(BeetleETL.database).to eql(database)
66
+ expect(BeetleETL.database).to eql(database)
67
+ end
68
+
69
+ end
70
+ end
data/spec/dependency_resolver_spec.rb ADDED
@@ -0,0 +1,57 @@
1
+ require 'spec_helper'
2
+
3
+ module BeetleETL
4
+ describe DependencyResolver do
5
+
6
+ # test dependencies
7
+ #
8
+ # A
9
+ # / | \
10
+ # B | C
11
+ # | \ /|
12
+ # | D |
13
+ # \ / \|
14
+ # E F
15
+
16
+ Item = Struct.new(:name, :dependencies)
17
+
18
+ let(:a) { Item.new(:a, Set.new) }
19
+ let(:b) { Item.new(:b, Set.new([:a])) }
20
+ let(:c) { Item.new(:c, Set.new([:a])) }
21
+ let(:d) { Item.new(:d, Set.new([:a, :c])) }
22
+ let(:e) { Item.new(:e, Set.new([:c, :d])) }
23
+ let(:f) { Item.new(:f, Set.new([:c, :d])) }
24
+
25
+ def items
26
+ [a, b, c, d, e, f].shuffle
27
+ end
28
+
29
+ describe '#resolvables' do
30
+ let(:resolver) { DependencyResolver.new(items) }
31
+
32
+ it 'returns all items without dependencies when given an empty array' do
33
+ expect(resolver.resolvables([])).to match_array([a])
34
+ end
35
+
36
+ it 'returns all items with met dependencies' do
37
+ expect(resolver.resolvables([:a, :b, :c])).to match_array([d])
38
+ expect(resolver.resolvables([:a, :b, :c, :d])).to match_array([e, f])
39
+ end
40
+ end
41
+
42
+ context 'with cyclic or missing dependencies' do
43
+ let(:cyclic) { Item.new(:a, Set.new([:b])) }
44
+
45
+ it 'detects cyclic dependencies' do
46
+ expect { DependencyResolver.new([cyclic, b]) }.to \
47
+ raise_error(BeetleETL::UnsatisfiableDependenciesError)
48
+ end
49
+
50
+ it 'detects unsatisfiable dependencies' do
51
+ expect { DependencyResolver.new([b]) }.to \
52
+ raise_error(BeetleETL::UnsatisfiableDependenciesError)
53
+ end
54
+ end
55
+
56
+ end
57
+ end
data/spec/dsl/dsl_spec.rb ADDED
@@ -0,0 +1,44 @@
1
+ require 'spec_helper'
2
+
3
+ module BeetleETL
4
+ describe DSL do
5
+
6
+ subject { DSL.new(:foo_table) }
7
+
8
+ describe '#stage_table' do
9
+ it 'returns the stage table name including the schema defined in the config' do
10
+ BeetleETL.configure { |config| config.stage_schema = 'bar' }
11
+ expect(subject.stage_table).to eql('"bar"."foo_table"')
12
+ end
13
+ end
14
+
15
+ describe '#external_source' do
16
+ it 'returns the external source‘s identifier' do
17
+ expect(subject.external_source).to eql('source')
18
+ end
19
+ end
20
+
21
+ describe '#combined_key' do
22
+ it 'returns an SQL string for combined external ids' do
23
+ expect(subject.combined_key('foo', 'bar')).to eql(
24
+ %q('[' || foo || ',' || bar || ']')
25
+ )
26
+ end
27
+
28
+ it 'works with multiple arguments' do
29
+ expect(subject.combined_key('foo', 'bar', 'baz')).to eql(
30
+ %q('[' || foo || ',' || bar || ',' || baz || ']')
31
+ )
32
+ end
33
+ end
34
+
35
+ describe '#import_run_id' do
36
+ it 'returns the import run id defined in the config' do
37
+ id = double(:id)
38
+ allow(BeetleETL.state).to receive(:run_id) { id }
39
+ expect(subject.import_run_id).to eql(id)
40
+ end
41
+ end
42
+
43
+ end
44
+ end
data/spec/dsl/transformation_loader_spec.rb ADDED
@@ -0,0 +1,51 @@
1
+ require 'spec_helper'
2
+ require 'tempfile'
3
+
4
+ module BeetleETL
5
+ describe TransformationLoader do
6
+
7
+ before :example do
8
+ data_file = tempfile_with_contents <<-FILE
9
+ import :foo do
10
+ 'foo'
11
+ end
12
+
13
+ import :bar do
14
+ 'bar'
15
+ end
16
+ FILE
17
+
18
+ BeetleETL.configure do |config|
19
+ config.transformation_file = data_file.path
20
+ end
21
+ end
22
+
23
+ describe '#load' do
24
+ it 'loads runlist entries from the data file' do
25
+ expect(Transformation).to receive(:new) do |table_name, config|
26
+ expect(table_name.to_s).to eql(config.call)
27
+ end.exactly(2).times
28
+
29
+ subject.load
30
+ end
31
+
32
+ it 'adds every runlist entry to the entries array' do
33
+ allow(Transformation).to receive(:new) do |table_name, config|
34
+ table_name
35
+ end
36
+
37
+ transformations = subject.load
38
+
39
+ expect(transformations).to eql(%i[foo bar])
40
+ end
41
+ end
42
+
43
+ def tempfile_with_contents(contents)
44
+ Tempfile.new('transform').tap do |file|
45
+ file.write(contents)
46
+ file.close
47
+ end
48
+ end
49
+
50
+ end
51
+ end
data/spec/dsl/transformation_spec.rb ADDED
@@ -0,0 +1,54 @@
1
+ require 'spec_helper'
2
+
3
+ module BeetleETL
4
+ describe Transformation do
5
+
6
+ describe '#table_name' do
7
+ it 'returns the given table name' do
8
+ transformation = Transformation.new(:table, Proc.new {})
9
+ expect(transformation.table_name).to eql(:table)
10
+ end
11
+ end
12
+
13
+ describe '#relations' do
14
+ it 'returns the list of foreign tables and their foreign key column' do
15
+ setup = Proc.new do
16
+ references :foreign_table, on: :foreign_table_id
17
+ end
18
+ transformation = Transformation.new(:table, setup)
19
+
20
+ expect(transformation.relations).to eql({
21
+ foreign_table_id: :foreign_table
22
+ })
23
+ end
24
+ end
25
+
26
+ describe '#dependencies' do
27
+ it 'returns the depending tables' do
28
+ setup = Proc.new do
29
+ references :foreign_table, on: :foreign_table_id
30
+ references :another_foreign_table, on: :another_foreign_table_id
31
+ end
32
+ transformation = Transformation.new(:table, setup)
33
+
34
+ expect(transformation.dependencies).to eql(Set.new([:foreign_table, :another_foreign_table]))
35
+ end
36
+ end
37
+
38
+ describe '#query' do
39
+ it 'returns the query interpolating methods in scope' do
40
+
41
+ setup = Proc.new do
42
+ def foo; "foo_string"; end
43
+ query "SELECT '#{foo}' FROM some_table"
44
+ end
45
+ transformation = Transformation.new(:table, setup)
46
+
47
+ expect(transformation.query).to eql(
48
+ "SELECT 'foo_string' FROM some_table"
49
+ )
50
+ end
51
+ end
52
+
53
+ end
54
+ end
data/spec/feature/example_schema.rb ADDED
@@ -0,0 +1,192 @@
1
+ module ExampleSchema
2
+
3
+ def create_tables
4
+ create_source_tables
5
+ create_stage_tables
6
+ create_target_tables
7
+ end
8
+
9
+ def drop_tables
10
+ drop_source_tables
11
+ drop_stage_tables
12
+ drop_target_tables
13
+ end
14
+
15
+ def create_source_tables
16
+ test_database.create_schema :source
17
+
18
+ test_database.create_table :source__Organisation do
19
+ Integer :pkOrgId
20
+ String :Name, size: 255
21
+ String :Abteilung, size: 255
22
+ end
23
+
24
+ test_database.create_table :source__Person do
25
+ Integer :pkPersID
26
+ String :Vorname, size: 255
27
+ String :Nachname, size: 255
28
+ Integer :fkFirma
29
+ Integer :fkAdresse
30
+ Integer :fkTyp
31
+ end
32
+
33
+ test_database.create_table :source__Veranstaltung do
34
+ Integer :pkVeranstaltungId
35
+ Integer :fkOrganisation
36
+ end
37
+
38
+ test_database.create_table :source__Veranstaltungsbesuch do
39
+ Integer :fkVeranstaltung
40
+ Integer :fkBesucher
41
+ end
42
+ end
43
+
44
+ def drop_source_tables
45
+ test_database.drop_schema :source, cascade: true
46
+ end
47
+
48
+ def create_stage_tables
49
+ test_database.create_schema :stage
50
+
51
+ test_database.create_table :stage__import_runs do
52
+ primary_key :id
53
+ DateTime :started_at
54
+ DateTime :finished_at
55
+ String :state, size: 255
56
+ end
57
+
58
+ test_database.create_table :stage__organisations do
59
+ Integer :id
60
+ String :external_id, size: 255
61
+ foreign_key :import_run_id, :stage__import_runs
62
+ index [:external_id, :import_run_id]
63
+ String :transition, size: 255
64
+
65
+ String :name, size: 255
66
+ end
67
+
68
+ test_database.create_table :stage__departments do
69
+ Integer :id
70
+ String :external_id, size: 255
71
+ foreign_key :import_run_id, :stage__import_runs
72
+ index [:external_id, :import_run_id]
73
+ String :transition, size: 255
74
+
75
+ String :name, size: 255
76
+
77
+ String :external_organisation_id, size: 255
78
+ Integer :organisation_id
79
+
80
+ end
81
+
82
+ test_database.create_table :stage__attendees do
83
+ Integer :id
84
+ String :external_id, size: 255
85
+ foreign_key :import_run_id, :stage__import_runs
86
+ index [:external_id, :import_run_id]
87
+ String :transition, size: 255
88
+
89
+ String :first_name, size: 255
90
+ String :last_name, size: 255
91
+ end
92
+
93
+ test_database.create_table :stage__events do
94
+ Integer :id
95
+ String :external_id, size: 255
96
+ foreign_key :import_run_id, :stage__import_runs
97
+ index [:external_id, :import_run_id]
98
+ String :transition, size: 255
99
+
100
+ String :name, size: 255
101
+ DateTime :starts_at
102
+ DateTime :ends_at
103
+
104
+ String :external_organisations_id, size: 255
105
+ Integer :organisation_id
106
+ end
107
+
108
+ test_database.create_table :stage__attendees_events do
109
+ Integer :id
110
+ String :external_id, size: 255
111
+ foreign_key :import_run_id, :stage__import_runs
112
+ index [:external_id, :import_run_id]
113
+ String :transition, size: 255
114
+
115
+ String :external_attendee_id, size: 255
116
+ Integer :attendee_id
117
+
118
+ String :external_event_id, size: 255
119
+ Integer :event_id
120
+ end
121
+ end
122
+
123
+ def drop_stage_tables
124
+ test_database.drop_schema :stage, cascade: true
125
+ end
126
+
127
+ def create_target_tables
128
+ test_database.create_table :organisations do
129
+ primary_key :id
130
+ String :external_id, size: 255
131
+ String :external_source, size: 255
132
+ String :name, size: 255
133
+ DateTime :created_at
134
+ DateTime :updated_at
135
+ DateTime :deleted_at
136
+ end
137
+
138
+ test_database.create_table :departments do
139
+ primary_key :id
140
+ String :external_id, size: 255
141
+ String :external_source, size: 255
142
+ String :name, size: 255
143
+ foreign_key :organisation_id, :organisations
144
+ DateTime :created_at
145
+ DateTime :updated_at
146
+ DateTime :deleted_at
147
+ end
148
+
149
+ test_database.create_table :attendees do
150
+ primary_key :id
151
+ String :external_id, size: 255
152
+ String :external_source, size: 255
153
+ String :first_name, size: 255
154
+ String :last_name, size: 255
155
+ DateTime :created_at
156
+ DateTime :updated_at
157
+ DateTime :deleted_at
158
+ end
159
+
160
+ test_database.create_table :events do
161
+ primary_key :id
162
+ String :external_id, size: 255
163
+ String :external_source, size: 255
164
+ String :name, size: 255
165
+ DateTime :starts_at
166
+ DateTime :ends_at
167
+ foreign_key :organisation, :organisations
168
+ DateTime :created_at
169
+ DateTime :updated_at
170
+ DateTime :deleted_at
171
+ end
172
+
173
+ test_database.create_table :attendees_events do
174
+ foreign_key :attendee_id, :attendees, null: false
175
+ foreign_key :event_id, :events, null: false
176
+ primary_key [:attendee_id, :event_id]
177
+ index [:attendee_id, :event_id]
178
+ DateTime :created_at
179
+ DateTime :updated_at
180
+ DateTime :deleted_at
181
+ end
182
+ end
183
+
184
+ def drop_target_tables
185
+ test_database.drop_table :attendees_events
186
+ test_database.drop_table :events
187
+ test_database.drop_table :attendees
188
+ test_database.drop_table :departments
189
+ test_database.drop_table :organisations
190
+ end
191
+
192
+ end