beetle_etl 0.0.2 → 0.0.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (36) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +6 -0
  3. data/beetle_etl.gemspec +5 -6
  4. data/lib/beetle_etl.rb +7 -12
  5. data/lib/beetle_etl/dsl/dsl.rb +11 -12
  6. data/lib/beetle_etl/dsl/transformation.rb +10 -3
  7. data/lib/beetle_etl/dsl/transformation_loader.rb +13 -5
  8. data/lib/beetle_etl/import.rb +11 -4
  9. data/lib/beetle_etl/naming.rb +37 -0
  10. data/lib/beetle_etl/steps/assign_ids.rb +14 -38
  11. data/lib/beetle_etl/steps/create_stage.rb +59 -0
  12. data/lib/beetle_etl/steps/drop_stage.rb +15 -0
  13. data/lib/beetle_etl/steps/load.rb +46 -61
  14. data/lib/beetle_etl/steps/map_relations.rb +8 -14
  15. data/lib/beetle_etl/steps/step.rb +1 -8
  16. data/lib/beetle_etl/steps/table_diff.rb +68 -89
  17. data/lib/beetle_etl/steps/transform.rb +2 -4
  18. data/lib/beetle_etl/version.rb +1 -1
  19. data/spec/beetle_etl_spec.rb +3 -25
  20. data/spec/dsl/dsl_spec.rb +8 -15
  21. data/spec/dsl/transformation_loader_spec.rb +11 -4
  22. data/spec/dsl/transformation_spec.rb +40 -4
  23. data/spec/feature/example_schema.rb +2 -137
  24. data/spec/feature/example_transform.rb +13 -6
  25. data/spec/feature/feature_spec.rb +119 -18
  26. data/spec/steps/assign_ids_spec.rb +23 -28
  27. data/spec/steps/create_stage_spec.rb +89 -0
  28. data/spec/steps/load_spec.rb +15 -23
  29. data/spec/steps/map_relations_spec.rb +32 -36
  30. data/spec/steps/table_diff_spec.rb +41 -45
  31. data/spec/steps/transform_spec.rb +2 -0
  32. data/spec/{dependency_resolver_spec.rb → task_runner/dependency_resolver_spec.rb} +0 -0
  33. metadata +22 -36
  34. data/lib/beetle_etl/state.rb +0 -67
  35. data/spec/import_spec.rb +0 -7
  36. data/spec/state_spec.rb +0 -124
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b08e51fe6819079c6b8317636030d5d678b102cc
4
- data.tar.gz: 6afa19db3171f105fa275f99c82c07306aa9c568
3
+ metadata.gz: 9d792ed373d43d6c0ab7ec27241eaf18fb59736b
4
+ data.tar.gz: 7b08688ce87fbff7eea72b362773444ec4700b2c
5
5
  SHA512:
6
- metadata.gz: a915efc65e4450aa4ba7cec71a75b89082d6b65be4cd2e6db88ee8e540f9bc0d1db96afd18c787ac504309460447bfdbb20ab462e012ac503e8fe8cdecb65880
7
- data.tar.gz: afc74720f9875d7a447030bb14fbc4da90c1004b5c725fb55b0630bbc073d3d15560e6c5e2b7002951e1989eb4b63b956508411811c146986820aee2679a6462
6
+ metadata.gz: 7022ab2cc6a60f57f061d3b1acaa8a125da6db7e9cec76421189ae2fa2c4729b4134cb8246a8c14283aeb7769e1ef92a75f480e2d6be3a3e41ba2e8017b7ee2c
7
+ data.tar.gz: fa6e2f2f74cf1c4a6a03e53c2d76c962154e8d64f656d32690101bdd4dfa636dda9036213b29dfc675d07e9ed381a6527e94abb462f4f7eabfa3c3f11f279ee1
data/Gemfile CHANGED
@@ -2,3 +2,9 @@ source 'https://rubygems.org'
2
2
 
3
3
  # Specify your gem's dependencies in beetle_etl.gemspec
4
4
  gemspec
5
+
6
+ group :test do
7
+ gem 'rake'
8
+ gem 'codeclimate-test-reporter'
9
+ gem 'byebug'
10
+ end
@@ -18,12 +18,11 @@ Gem::Specification.new do |spec|
18
18
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
19
  spec.require_paths = ['lib']
20
20
 
21
- spec.add_runtime_dependency 'sequel', '>= 4.13.0'
21
+ spec.add_runtime_dependency 'sequel', '>= 4.0.0'
22
22
 
23
23
  spec.add_development_dependency 'bundler', '~> 1.6'
24
- spec.add_development_dependency 'rake'
25
- spec.add_development_dependency 'rspec', '~> 3.1.0'
26
- spec.add_development_dependency 'pg'
27
- spec.add_development_dependency 'codeclimate-test-reporter'
28
- spec.add_development_dependency 'activesupport'
24
+ spec.add_development_dependency 'rspec', '>= 3.0.0'
25
+ spec.add_development_dependency 'timecop', '>= 0.7.0'
26
+ spec.add_development_dependency 'pg', '>= 0.18.0'
27
+ spec.add_development_dependency 'activesupport', '>= 4.2.0'
29
28
  end
@@ -10,17 +10,20 @@ module BeetleETL
10
10
  require 'beetle_etl/dsl/transformation'
11
11
  require 'beetle_etl/dsl/transformation_loader'
12
12
 
13
+ require 'beetle_etl/naming'
14
+
13
15
  require 'beetle_etl/steps/step'
16
+ require 'beetle_etl/steps/create_stage'
14
17
  require 'beetle_etl/steps/transform'
15
18
  require 'beetle_etl/steps/map_relations'
16
19
  require 'beetle_etl/steps/table_diff'
17
20
  require 'beetle_etl/steps/assign_ids'
18
21
  require 'beetle_etl/steps/load'
22
+ require 'beetle_etl/steps/drop_stage'
19
23
 
20
24
  require 'beetle_etl/task_runner/dependency_resolver'
21
25
  require 'beetle_etl/task_runner/task_runner'
22
26
 
23
- require 'beetle_etl/state'
24
27
  require 'beetle_etl/import'
25
28
 
26
29
  class Configuration
@@ -29,23 +32,20 @@ module BeetleETL
29
32
  :database,
30
33
  :transformation_file,
31
34
  :stage_schema,
35
+ :public_schema,
32
36
  :external_source
33
37
 
34
38
  def initialize
35
- @stage_schema = 'stage'
39
+ @public_schema = 'public'
36
40
  end
37
41
  end
38
42
 
39
43
  class << self
40
44
 
41
45
  def import
42
- state.start_import
43
-
44
46
  begin
45
- Import.run
46
- state.mark_as_succeeded
47
+ Import.new.run
47
48
  rescue Exception => e
48
- state.mark_as_failed
49
49
  raise e
50
50
  ensure
51
51
  @database.disconnect if @database
@@ -71,13 +71,8 @@ module BeetleETL
71
71
  end
72
72
  end
73
73
 
74
- def state
75
- @state ||= State.new
76
- end
77
-
78
74
  def reset
79
75
  @config = nil
80
- @state = nil
81
76
  @database = nil
82
77
  end
83
78
 
@@ -1,11 +1,17 @@
1
1
  module BeetleETL
2
2
  class DSL
3
3
 
4
- attr_reader :relations, :query_string
4
+ attr_reader :column_names, :relations, :query_strings
5
5
 
6
6
  def initialize(table_name)
7
7
  @table_name = table_name
8
+ @column_names = []
8
9
  @relations = {}
10
+ @query_strings = []
11
+ end
12
+
13
+ def columns(*column_names)
14
+ @column_names = column_names
9
15
  end
10
16
 
11
17
  def references(foreign_table, on: foreign_key)
@@ -13,25 +19,18 @@ module BeetleETL
13
19
  end
14
20
 
15
21
  def query(query)
16
- @query_string = query
22
+ @query_strings << query
17
23
  end
18
24
 
25
+ # query helper methods
19
26
 
20
- def stage_table
21
- %Q("#{BeetleETL.config.stage_schema}"."#{@table_name}")
22
- end
23
-
24
- def external_source
25
- 'source'
27
+ def stage_table(table_name = nil)
28
+ BeetleETL::Naming.stage_table_name_sql(table_name || @table_name)
26
29
  end
27
30
 
28
31
  def combined_key(*args)
29
32
  %Q('[' || #{args.join(%q[ || ',' || ])} || ']')
30
33
  end
31
34
 
32
- def import_run_id
33
- BeetleETL.state.run_id
34
- end
35
-
36
35
  end
37
36
  end
@@ -5,9 +5,16 @@ module BeetleETL
5
5
 
6
6
  attr_reader :table_name
7
7
 
8
- def initialize(table_name, setup)
8
+ def initialize(table_name, setup, helpers = nil)
9
9
  @table_name = table_name
10
- (@parsed = DSL.new(table_name)).instance_eval(&setup)
10
+ @parsed = DSL.new(table_name).tap do |dsl|
11
+ dsl.instance_eval(&helpers) if helpers
12
+ dsl.instance_eval(&setup)
13
+ end
14
+ end
15
+
16
+ def column_names
17
+ @parsed.column_names.map(&:to_sym)
11
18
  end
12
19
 
13
20
  def relations
@@ -19,7 +26,7 @@ module BeetleETL
19
26
  end
20
27
 
21
28
  def query
22
- @parsed.query_string
29
+ @parsed.query_strings.join(';')
23
30
  end
24
31
 
25
32
  end
@@ -1,21 +1,29 @@
1
1
  module BeetleETL
2
- module TransformationLoader
3
- extend self
2
+ class TransformationLoader
4
3
 
5
- def load
4
+ def initialize
6
5
  @transformations = []
6
+ @helper_definitions = nil
7
+ end
7
8
 
9
+ def load
8
10
  File.open(BeetleETL.config.transformation_file, 'r') do |file|
9
11
  instance_eval file.read
10
12
  end
11
13
 
12
- @transformations
14
+ @transformations.map do |(table_name, setup)|
15
+ Transformation.new(table_name, setup, @helper_definitions)
16
+ end
13
17
  end
14
18
 
15
19
  private
16
20
 
17
21
  def import(table_name, &setup)
18
- @transformations << Transformation.new(table_name, setup)
22
+ @transformations << [table_name, setup]
23
+ end
24
+
25
+ def helpers(&helper_definitions)
26
+ @helper_definitions = helper_definitions
19
27
  end
20
28
 
21
29
  end
@@ -1,13 +1,15 @@
1
1
  module BeetleETL
2
- module Import
3
-
4
- extend self
2
+ class Import
5
3
 
6
4
  def run
7
5
  TaskRunner.new(data_steps).run
8
6
  BeetleETL.database.transaction do
9
7
  TaskRunner.new(load_steps).run
10
8
  end
9
+ rescue => e
10
+ raise e
11
+ ensure
12
+ TaskRunner.new(cleanup_steps).run
11
13
  end
12
14
 
13
15
  private
@@ -15,6 +17,7 @@ module BeetleETL
15
17
  def data_steps
16
18
  transformations.flat_map do |t|
17
19
  [
20
+ CreateStage.new(t.table_name, t.relations, t.column_names),
18
21
  Transform.new(t.table_name, t.dependencies, t.query),
19
22
  MapRelations.new(t.table_name, t.relations),
20
23
  TableDiff.new(t.table_name),
@@ -29,8 +32,12 @@ module BeetleETL
29
32
  end
30
33
  end
31
34
 
35
+ def cleanup_steps
36
+ transformations.map { |t| DropStage.new(t.table_name) }
37
+ end
38
+
32
39
  def transformations
33
- @transformations ||= TransformationLoader.load
40
+ @transformations ||= TransformationLoader.new.load
34
41
  end
35
42
 
36
43
  end
@@ -0,0 +1,37 @@
1
+ require 'digest'
2
+
3
+ module BeetleETL
4
+ module Naming
5
+
6
+ extend self
7
+
8
+ def stage_table_name(table_name = nil)
9
+ name = (table_name || @table_name).to_s
10
+ digest = Digest::MD5.hexdigest(name)
11
+ "#{name}-#{digest}"[0, 63]
12
+ end
13
+
14
+ def stage_table_name_sql(table_name = nil)
15
+ %Q("#{stage_table_name(table_name)}")
16
+ end
17
+
18
+ def public_table_name(table_name = nil)
19
+ name = (table_name || @table_name).to_s
20
+ [public_schema, name].compact.join('.')
21
+ end
22
+
23
+ def public_table_name_sql(table_name = nil)
24
+ name = (table_name || @table_name).to_s
25
+ public_table_name= [public_schema, name].compact.join('"."')
26
+ %Q("#{public_table_name}")
27
+ end
28
+
29
+ private
30
+
31
+ def public_schema
32
+ public_schema = BeetleETL.config.public_schema
33
+ public_schema != 'public' ? public_schema : nil
34
+ end
35
+
36
+ end
37
+ end
@@ -6,50 +6,26 @@ module BeetleETL
6
6
  end
7
7
 
8
8
  def run
9
- [
10
- Thread.new { assign_new_ids },
11
- Thread.new { map_existing_ids }
12
- ].each(&:join)
9
+ assign_new_ids
10
+ map_existing_ids
13
11
  end
14
12
 
15
13
  def assign_new_ids
16
- stage_table
17
- .where(
18
- import_run_id: run_id,
19
- transition: 'CREATE'
20
- )
21
- .update(
22
- id: Sequel.function(:NEXTVAL, "public.#{table_name}_id_seq")
23
- )
14
+ database.execute <<-SQL
15
+ UPDATE #{stage_table_name_sql}
16
+ SET id = nextval('#{table_name}_id_seq')
17
+ WHERE transition = 'CREATE'
18
+ SQL
24
19
  end
25
20
 
26
21
  def map_existing_ids
27
- stage_table
28
- .from(stage_table_identifier, public_table_identifier)
29
- .where(
30
- stage__import_run_id: run_id,
31
- stage__transition: %w(KEEP UPDATE DELETE UNDELETE),
32
- stage__external_id: :public__external_id
33
- )
34
- .update(id: :public__id)
35
- end
36
-
37
- private
38
-
39
- def stage_table_identifier
40
- :"#{stage_schema}__#{table_name}___stage"
41
- end
42
-
43
- def stage_table
44
- database[stage_table_identifier]
45
- end
46
-
47
- def public_table_identifier
48
- :"#{table_name}___public"
49
- end
50
-
51
- def public_table
52
- database[public_table_identifier]
22
+ database.execute <<-SQL
23
+ UPDATE #{stage_table_name_sql} stage
24
+ SET id = public.id
25
+ FROM #{public_table_name_sql} public
26
+ WHERE stage.transition IN ('KEEP', 'UPDATE', 'DELETE', 'UNDELETE')
27
+ AND stage.external_id = public.external_id
28
+ SQL
53
29
  end
54
30
 
55
31
  end
@@ -0,0 +1,59 @@
1
+ module BeetleETL
2
+ class CreateStage < Step
3
+
4
+ def initialize(table_name, relations, column_names)
5
+ super(table_name)
6
+ @relations = relations
7
+ @column_names = column_names
8
+ end
9
+
10
+ def dependencies
11
+ Set.new
12
+ end
13
+
14
+ def run
15
+ database.execute <<-SQL
16
+ CREATE TABLE #{stage_table_name_sql} (
17
+ id integer,
18
+ external_id character varying(255),
19
+ transition character varying(255),
20
+
21
+ #{[
22
+ payload_column_definitions,
23
+ relation_column_definitions
24
+ ].compact.join(',')}
25
+ )
26
+ SQL
27
+ end
28
+
29
+ private
30
+
31
+ def payload_column_definitions
32
+ definitions = (@column_names - @relations.keys).map do |column_name|
33
+ "#{column_name} #{column_type(column_name)}"
34
+ end
35
+ definitions.join(',') if definitions.any?
36
+ end
37
+
38
+ def relation_column_definitions
39
+ definitions = @relations.map do |foreign_key_name, table|
40
+ <<-SQL
41
+ #{foreign_key_name} integer,
42
+ external_#{foreign_key_name} character varying(255)
43
+ SQL
44
+ end
45
+ definitions.join(',') if definitions.any?
46
+ end
47
+
48
+ def column_type(column_name)
49
+ @column_types ||= Hash[database.schema(public_table_name.to_sym)]
50
+ .reduce({}) do |acc, (name, schema)|
51
+ acc[name.to_sym] = schema.fetch(:db_type)
52
+ acc
53
+ end
54
+
55
+ @column_types[column_name]
56
+ end
57
+
58
+ end
59
+ end
@@ -0,0 +1,15 @@
1
+ module BeetleETL
2
+ class DropStage < Step
3
+
4
+ def dependencies
5
+ Set.new
6
+ end
7
+
8
+ def run
9
+ database.execute <<-SQL
10
+ DROP TABLE IF EXISTS #{stage_table_name_sql}
11
+ SQL
12
+ end
13
+
14
+ end
15
+ end
@@ -2,93 +2,78 @@ module BeetleETL
2
2
  class Load < Step
3
3
 
4
4
  IMPORTER_COLUMNS = %i[
5
- import_run_id
6
5
  external_source
7
6
  transition
8
7
  ]
9
8
 
10
- attr_reader :relations
11
-
12
9
  def initialize(table_name, relations)
13
10
  super(table_name)
14
11
  @relations = relations
15
12
  end
16
13
 
17
14
  def run
18
- %w(create update delete undelete).map do |transition|
19
- Thread.new { public_send(:"load_#{transition}") }
20
- end.each(&:join)
15
+ %w(create update delete undelete).each do |transition|
16
+ public_send(:"load_#{transition}")
17
+ end
21
18
  end
22
19
 
23
20
  def dependencies
24
- relations.values.map { |d| Load.step_name(d) }.to_set
21
+ @relations.values.map { |d| Load.step_name(d) }.to_set
25
22
  end
26
23
 
27
24
  def load_create
28
25
  just_now = now
29
- database[table_name].import(
30
- data_columns + [:external_source, :created_at, :updated_at],
31
- database[:"#{stage_schema}__#{table_name}"]
32
- .select(*data_columns)
33
- .where(
34
- import_run_id: run_id,
35
- transition: 'CREATE'
36
- )
37
- .select_more(external_source, just_now, just_now)
38
- )
26
+
27
+ database.execute <<-SQL
28
+ INSERT INTO #{public_table_name_sql}
29
+ (#{data_columns.join(', ')}, external_source, created_at, updated_at)
30
+ SELECT
31
+ #{data_columns.join(', ')},
32
+ '#{external_source}',
33
+ '#{just_now}',
34
+ '#{just_now}'
35
+ FROM #{stage_table_name_sql}
36
+ WHERE transition = 'CREATE'
37
+ SQL
39
38
  end
40
39
 
41
40
  def load_update
42
- updates = updatable_columns.reduce({updated_at: now}) do |acc, column|
43
- acc[column] = :"stage__#{column}"
44
- acc
45
- end
46
-
47
- database.from(
48
- :"#{table_name}___public",
49
- :"#{stage_schema}__#{table_name}___stage"
50
- )
51
- .where(
52
- stage__id: :public__id,
53
- stage__transition: 'UPDATE',
54
- stage__import_run_id: run_id,
55
- )
56
- .update(updates)
41
+ database.execute <<-SQL
42
+ UPDATE #{public_table_name_sql} public
43
+ SET
44
+ #{updatable_columns.map { |c| %Q("#{c}" = stage."#{c}") }.join(',')},
45
+ "updated_at" = '#{now}'
46
+ FROM #{stage_table_name_sql} stage
47
+ WHERE stage.id = public.id
48
+ AND stage.transition = 'UPDATE'
49
+ SQL
57
50
  end
58
51
 
59
52
  def load_delete
60
53
  just_now = now
61
- database.from(
62
- :"#{table_name}___public",
63
- :"#{stage_schema}__#{table_name}___stage"
64
- )
65
- .where(
66
- stage__id: :public__id,
67
- stage__transition: 'DELETE',
68
- stage__import_run_id: run_id,
69
- )
70
- .update(
71
- updated_at: just_now,
72
- deleted_at: just_now,
73
- )
54
+
55
+ database.execute <<-SQL
56
+ UPDATE #{public_table_name_sql} public
57
+ SET
58
+ updated_at = '#{just_now}',
59
+ deleted_at = '#{just_now}'
60
+ FROM #{stage_table_name_sql} stage
61
+ WHERE stage.id = public.id
62
+ AND stage.transition = 'DELETE'
63
+ SQL
74
64
  end
75
65
 
76
66
  def load_undelete
77
- updates = updatable_columns.reduce({updated_at: now, deleted_at: nil}) do |acc, column|
78
- acc[column] = :"stage__#{column}"
79
- acc
80
- end
81
-
82
- database.from(
83
- :"#{table_name}___public",
84
- :"#{stage_schema}__#{table_name}___stage"
85
- )
86
- .where(
87
- stage__id: :public__id,
88
- stage__transition: 'UNDELETE',
89
- stage__import_run_id: run_id,
90
- )
91
- .update(updates)
67
+ database.execute <<-SQL
68
+ UPDATE #{public_table_name_sql} public
69
+ SET
70
+ #{updatable_columns.map { |c| %Q("#{c}" = stage."#{c}") }.join(',')},
71
+ updated_at = '#{now}',
72
+ deleted_at = NULL
73
+ FROM #{stage_table_name_sql} stage
74
+ WHERE stage.id = public.id
75
+ AND stage.transition = 'UNDELETE'
76
+ SQL
92
77
  end
93
78
 
94
79
  private
@@ -98,7 +83,7 @@ module BeetleETL
98
83
  end
99
84
 
100
85
  def table_columns
101
- @table_columns ||= database[:"#{stage_schema}__#{table_name}"].columns
86
+ @table_columns ||= database[stage_table_name.to_sym].columns
102
87
  end
103
88
 
104
89
  def ignored_columns