beetle_etl 0.0.2 → 0.0.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. checksums.yaml +4 -4
  2. data/Gemfile +6 -0
  3. data/beetle_etl.gemspec +5 -6
  4. data/lib/beetle_etl.rb +7 -12
  5. data/lib/beetle_etl/dsl/dsl.rb +11 -12
  6. data/lib/beetle_etl/dsl/transformation.rb +10 -3
  7. data/lib/beetle_etl/dsl/transformation_loader.rb +13 -5
  8. data/lib/beetle_etl/import.rb +11 -4
  9. data/lib/beetle_etl/naming.rb +37 -0
  10. data/lib/beetle_etl/steps/assign_ids.rb +14 -38
  11. data/lib/beetle_etl/steps/create_stage.rb +59 -0
  12. data/lib/beetle_etl/steps/drop_stage.rb +15 -0
  13. data/lib/beetle_etl/steps/load.rb +46 -61
  14. data/lib/beetle_etl/steps/map_relations.rb +8 -14
  15. data/lib/beetle_etl/steps/step.rb +1 -8
  16. data/lib/beetle_etl/steps/table_diff.rb +68 -89
  17. data/lib/beetle_etl/steps/transform.rb +2 -4
  18. data/lib/beetle_etl/version.rb +1 -1
  19. data/spec/beetle_etl_spec.rb +3 -25
  20. data/spec/dsl/dsl_spec.rb +8 -15
  21. data/spec/dsl/transformation_loader_spec.rb +11 -4
  22. data/spec/dsl/transformation_spec.rb +40 -4
  23. data/spec/feature/example_schema.rb +2 -137
  24. data/spec/feature/example_transform.rb +13 -6
  25. data/spec/feature/feature_spec.rb +119 -18
  26. data/spec/steps/assign_ids_spec.rb +23 -28
  27. data/spec/steps/create_stage_spec.rb +89 -0
  28. data/spec/steps/load_spec.rb +15 -23
  29. data/spec/steps/map_relations_spec.rb +32 -36
  30. data/spec/steps/table_diff_spec.rb +41 -45
  31. data/spec/steps/transform_spec.rb +2 -0
  32. data/spec/{dependency_resolver_spec.rb → task_runner/dependency_resolver_spec.rb} +0 -0
  33. metadata +22 -36
  34. data/lib/beetle_etl/state.rb +0 -67
  35. data/spec/import_spec.rb +0 -7
  36. data/spec/state_spec.rb +0 -124
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: b08e51fe6819079c6b8317636030d5d678b102cc
4
- data.tar.gz: 6afa19db3171f105fa275f99c82c07306aa9c568
3
+ metadata.gz: 9d792ed373d43d6c0ab7ec27241eaf18fb59736b
4
+ data.tar.gz: 7b08688ce87fbff7eea72b362773444ec4700b2c
5
5
  SHA512:
6
- metadata.gz: a915efc65e4450aa4ba7cec71a75b89082d6b65be4cd2e6db88ee8e540f9bc0d1db96afd18c787ac504309460447bfdbb20ab462e012ac503e8fe8cdecb65880
7
- data.tar.gz: afc74720f9875d7a447030bb14fbc4da90c1004b5c725fb55b0630bbc073d3d15560e6c5e2b7002951e1989eb4b63b956508411811c146986820aee2679a6462
6
+ metadata.gz: 7022ab2cc6a60f57f061d3b1acaa8a125da6db7e9cec76421189ae2fa2c4729b4134cb8246a8c14283aeb7769e1ef92a75f480e2d6be3a3e41ba2e8017b7ee2c
7
+ data.tar.gz: fa6e2f2f74cf1c4a6a03e53c2d76c962154e8d64f656d32690101bdd4dfa636dda9036213b29dfc675d07e9ed381a6527e94abb462f4f7eabfa3c3f11f279ee1
data/Gemfile CHANGED
@@ -2,3 +2,9 @@ source 'https://rubygems.org'
2
2
 
3
3
  # Specify your gem's dependencies in beetle_etl.gemspec
4
4
  gemspec
5
+
6
+ group :test do
7
+ gem 'rake'
8
+ gem 'codeclimate-test-reporter'
9
+ gem 'byebug'
10
+ end
data/beetle_etl.gemspec CHANGED
@@ -18,12 +18,11 @@ Gem::Specification.new do |spec|
18
18
  spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
19
  spec.require_paths = ['lib']
20
20
 
21
- spec.add_runtime_dependency 'sequel', '>= 4.13.0'
21
+ spec.add_runtime_dependency 'sequel', '>= 4.0.0'
22
22
 
23
23
  spec.add_development_dependency 'bundler', '~> 1.6'
24
- spec.add_development_dependency 'rake'
25
- spec.add_development_dependency 'rspec', '~> 3.1.0'
26
- spec.add_development_dependency 'pg'
27
- spec.add_development_dependency 'codeclimate-test-reporter'
28
- spec.add_development_dependency 'activesupport'
24
+ spec.add_development_dependency 'rspec', '>= 3.0.0'
25
+ spec.add_development_dependency 'timecop', '>= 0.7.0'
26
+ spec.add_development_dependency 'pg', '>= 0.18.0'
27
+ spec.add_development_dependency 'activesupport', '>= 4.2.0'
29
28
  end
data/lib/beetle_etl.rb CHANGED
@@ -10,17 +10,20 @@ module BeetleETL
10
10
  require 'beetle_etl/dsl/transformation'
11
11
  require 'beetle_etl/dsl/transformation_loader'
12
12
 
13
+ require 'beetle_etl/naming'
14
+
13
15
  require 'beetle_etl/steps/step'
16
+ require 'beetle_etl/steps/create_stage'
14
17
  require 'beetle_etl/steps/transform'
15
18
  require 'beetle_etl/steps/map_relations'
16
19
  require 'beetle_etl/steps/table_diff'
17
20
  require 'beetle_etl/steps/assign_ids'
18
21
  require 'beetle_etl/steps/load'
22
+ require 'beetle_etl/steps/drop_stage'
19
23
 
20
24
  require 'beetle_etl/task_runner/dependency_resolver'
21
25
  require 'beetle_etl/task_runner/task_runner'
22
26
 
23
- require 'beetle_etl/state'
24
27
  require 'beetle_etl/import'
25
28
 
26
29
  class Configuration
@@ -29,23 +32,20 @@ module BeetleETL
29
32
  :database,
30
33
  :transformation_file,
31
34
  :stage_schema,
35
+ :public_schema,
32
36
  :external_source
33
37
 
34
38
  def initialize
35
- @stage_schema = 'stage'
39
+ @public_schema = 'public'
36
40
  end
37
41
  end
38
42
 
39
43
  class << self
40
44
 
41
45
  def import
42
- state.start_import
43
-
44
46
  begin
45
- Import.run
46
- state.mark_as_succeeded
47
+ Import.new.run
47
48
  rescue Exception => e
48
- state.mark_as_failed
49
49
  raise e
50
50
  ensure
51
51
  @database.disconnect if @database
@@ -71,13 +71,8 @@ module BeetleETL
71
71
  end
72
72
  end
73
73
 
74
- def state
75
- @state ||= State.new
76
- end
77
-
78
74
  def reset
79
75
  @config = nil
80
- @state = nil
81
76
  @database = nil
82
77
  end
83
78
 
data/lib/beetle_etl/dsl/dsl.rb CHANGED
@@ -1,11 +1,17 @@
1
1
  module BeetleETL
2
2
  class DSL
3
3
 
4
- attr_reader :relations, :query_string
4
+ attr_reader :column_names, :relations, :query_strings
5
5
 
6
6
  def initialize(table_name)
7
7
  @table_name = table_name
8
+ @column_names = []
8
9
  @relations = {}
10
+ @query_strings = []
11
+ end
12
+
13
+ def columns(*column_names)
14
+ @column_names = column_names
9
15
  end
10
16
 
11
17
  def references(foreign_table, on: foreign_key)
@@ -13,25 +19,18 @@ module BeetleETL
13
19
  end
14
20
 
15
21
  def query(query)
16
- @query_string = query
22
+ @query_strings << query
17
23
  end
18
24
 
25
+ # query helper methods
19
26
 
20
- def stage_table
21
- %Q("#{BeetleETL.config.stage_schema}"."#{@table_name}")
22
- end
23
-
24
- def external_source
25
- 'source'
27
+ def stage_table(table_name = nil)
28
+ BeetleETL::Naming.stage_table_name_sql(table_name || @table_name)
26
29
  end
27
30
 
28
31
  def combined_key(*args)
29
32
  %Q('[' || #{args.join(%q[ || ',' || ])} || ']')
30
33
  end
31
34
 
32
- def import_run_id
33
- BeetleETL.state.run_id
34
- end
35
-
36
35
  end
37
36
  end
data/lib/beetle_etl/dsl/transformation.rb CHANGED
@@ -5,9 +5,16 @@ module BeetleETL
5
5
 
6
6
  attr_reader :table_name
7
7
 
8
- def initialize(table_name, setup)
8
+ def initialize(table_name, setup, helpers = nil)
9
9
  @table_name = table_name
10
- (@parsed = DSL.new(table_name)).instance_eval(&setup)
10
+ @parsed = DSL.new(table_name).tap do |dsl|
11
+ dsl.instance_eval(&helpers) if helpers
12
+ dsl.instance_eval(&setup)
13
+ end
14
+ end
15
+
16
+ def column_names
17
+ @parsed.column_names.map(&:to_sym)
11
18
  end
12
19
 
13
20
  def relations
@@ -19,7 +26,7 @@ module BeetleETL
19
26
  end
20
27
 
21
28
  def query
22
- @parsed.query_string
29
+ @parsed.query_strings.join(';')
23
30
  end
24
31
 
25
32
  end
data/lib/beetle_etl/dsl/transformation_loader.rb CHANGED
@@ -1,21 +1,29 @@
1
1
  module BeetleETL
2
- module TransformationLoader
3
- extend self
2
+ class TransformationLoader
4
3
 
5
- def load
4
+ def initialize
6
5
  @transformations = []
6
+ @helper_definitions = nil
7
+ end
7
8
 
9
+ def load
8
10
  File.open(BeetleETL.config.transformation_file, 'r') do |file|
9
11
  instance_eval file.read
10
12
  end
11
13
 
12
- @transformations
14
+ @transformations.map do |(table_name, setup)|
15
+ Transformation.new(table_name, setup, @helper_definitions)
16
+ end
13
17
  end
14
18
 
15
19
  private
16
20
 
17
21
  def import(table_name, &setup)
18
- @transformations << Transformation.new(table_name, setup)
22
+ @transformations << [table_name, setup]
23
+ end
24
+
25
+ def helpers(&helper_definitions)
26
+ @helper_definitions = helper_definitions
19
27
  end
20
28
 
21
29
  end
data/lib/beetle_etl/import.rb CHANGED
@@ -1,13 +1,15 @@
1
1
  module BeetleETL
2
- module Import
3
-
4
- extend self
2
+ class Import
5
3
 
6
4
  def run
7
5
  TaskRunner.new(data_steps).run
8
6
  BeetleETL.database.transaction do
9
7
  TaskRunner.new(load_steps).run
10
8
  end
9
+ rescue => e
10
+ raise e
11
+ ensure
12
+ TaskRunner.new(cleanup_steps).run
11
13
  end
12
14
 
13
15
  private
@@ -15,6 +17,7 @@ module BeetleETL
15
17
  def data_steps
16
18
  transformations.flat_map do |t|
17
19
  [
20
+ CreateStage.new(t.table_name, t.relations, t.column_names),
18
21
  Transform.new(t.table_name, t.dependencies, t.query),
19
22
  MapRelations.new(t.table_name, t.relations),
20
23
  TableDiff.new(t.table_name),
@@ -29,8 +32,12 @@ module BeetleETL
29
32
  end
30
33
  end
31
34
 
35
+ def cleanup_steps
36
+ transformations.map { |t| DropStage.new(t.table_name) }
37
+ end
38
+
32
39
  def transformations
33
- @transformations ||= TransformationLoader.load
40
+ @transformations ||= TransformationLoader.new.load
34
41
  end
35
42
 
36
43
  end
data/lib/beetle_etl/naming.rb ADDED
@@ -0,0 +1,37 @@
1
+ require 'digest'
2
+
3
+ module BeetleETL
4
+ module Naming
5
+
6
+ extend self
7
+
8
+ def stage_table_name(table_name = nil)
9
+ name = (table_name || @table_name).to_s
10
+ digest = Digest::MD5.hexdigest(name)
11
+ "#{name}-#{digest}"[0, 63]
12
+ end
13
+
14
+ def stage_table_name_sql(table_name = nil)
15
+ %Q("#{stage_table_name(table_name)}")
16
+ end
17
+
18
+ def public_table_name(table_name = nil)
19
+ name = (table_name || @table_name).to_s
20
+ [public_schema, name].compact.join('.')
21
+ end
22
+
23
+ def public_table_name_sql(table_name = nil)
24
+ name = (table_name || @table_name).to_s
25
+ public_table_name= [public_schema, name].compact.join('"."')
26
+ %Q("#{public_table_name}")
27
+ end
28
+
29
+ private
30
+
31
+ def public_schema
32
+ public_schema = BeetleETL.config.public_schema
33
+ public_schema != 'public' ? public_schema : nil
34
+ end
35
+
36
+ end
37
+ end
data/lib/beetle_etl/steps/assign_ids.rb CHANGED
@@ -6,50 +6,26 @@ module BeetleETL
6
6
  end
7
7
 
8
8
  def run
9
- [
10
- Thread.new { assign_new_ids },
11
- Thread.new { map_existing_ids }
12
- ].each(&:join)
9
+ assign_new_ids
10
+ map_existing_ids
13
11
  end
14
12
 
15
13
  def assign_new_ids
16
- stage_table
17
- .where(
18
- import_run_id: run_id,
19
- transition: 'CREATE'
20
- )
21
- .update(
22
- id: Sequel.function(:NEXTVAL, "public.#{table_name}_id_seq")
23
- )
14
+ database.execute <<-SQL
15
+ UPDATE #{stage_table_name_sql}
16
+ SET id = nextval('#{table_name}_id_seq')
17
+ WHERE transition = 'CREATE'
18
+ SQL
24
19
  end
25
20
 
26
21
  def map_existing_ids
27
- stage_table
28
- .from(stage_table_identifier, public_table_identifier)
29
- .where(
30
- stage__import_run_id: run_id,
31
- stage__transition: %w(KEEP UPDATE DELETE UNDELETE),
32
- stage__external_id: :public__external_id
33
- )
34
- .update(id: :public__id)
35
- end
36
-
37
- private
38
-
39
- def stage_table_identifier
40
- :"#{stage_schema}__#{table_name}___stage"
41
- end
42
-
43
- def stage_table
44
- database[stage_table_identifier]
45
- end
46
-
47
- def public_table_identifier
48
- :"#{table_name}___public"
49
- end
50
-
51
- def public_table
52
- database[public_table_identifier]
22
+ database.execute <<-SQL
23
+ UPDATE #{stage_table_name_sql} stage
24
+ SET id = public.id
25
+ FROM #{public_table_name_sql} public
26
+ WHERE stage.transition IN ('KEEP', 'UPDATE', 'DELETE', 'UNDELETE')
27
+ AND stage.external_id = public.external_id
28
+ SQL
53
29
  end
54
30
 
55
31
  end
data/lib/beetle_etl/steps/create_stage.rb ADDED
@@ -0,0 +1,59 @@
1
+ module BeetleETL
2
+ class CreateStage < Step
3
+
4
+ def initialize(table_name, relations, column_names)
5
+ super(table_name)
6
+ @relations = relations
7
+ @column_names = column_names
8
+ end
9
+
10
+ def dependencies
11
+ Set.new
12
+ end
13
+
14
+ def run
15
+ database.execute <<-SQL
16
+ CREATE TABLE #{stage_table_name_sql} (
17
+ id integer,
18
+ external_id character varying(255),
19
+ transition character varying(255),
20
+
21
+ #{[
22
+ payload_column_definitions,
23
+ relation_column_definitions
24
+ ].compact.join(',')}
25
+ )
26
+ SQL
27
+ end
28
+
29
+ private
30
+
31
+ def payload_column_definitions
32
+ definitions = (@column_names - @relations.keys).map do |column_name|
33
+ "#{column_name} #{column_type(column_name)}"
34
+ end
35
+ definitions.join(',') if definitions.any?
36
+ end
37
+
38
+ def relation_column_definitions
39
+ definitions = @relations.map do |foreign_key_name, table|
40
+ <<-SQL
41
+ #{foreign_key_name} integer,
42
+ external_#{foreign_key_name} character varying(255)
43
+ SQL
44
+ end
45
+ definitions.join(',') if definitions.any?
46
+ end
47
+
48
+ def column_type(column_name)
49
+ @column_types ||= Hash[database.schema(public_table_name.to_sym)]
50
+ .reduce({}) do |acc, (name, schema)|
51
+ acc[name.to_sym] = schema.fetch(:db_type)
52
+ acc
53
+ end
54
+
55
+ @column_types[column_name]
56
+ end
57
+
58
+ end
59
+ end
data/lib/beetle_etl/steps/drop_stage.rb ADDED
@@ -0,0 +1,15 @@
1
+ module BeetleETL
2
+ class DropStage < Step
3
+
4
+ def dependencies
5
+ Set.new
6
+ end
7
+
8
+ def run
9
+ database.execute <<-SQL
10
+ DROP TABLE IF EXISTS #{stage_table_name_sql}
11
+ SQL
12
+ end
13
+
14
+ end
15
+ end
data/lib/beetle_etl/steps/load.rb CHANGED
@@ -2,93 +2,78 @@ module BeetleETL
2
2
  class Load < Step
3
3
 
4
4
  IMPORTER_COLUMNS = %i[
5
- import_run_id
6
5
  external_source
7
6
  transition
8
7
  ]
9
8
 
10
- attr_reader :relations
11
-
12
9
  def initialize(table_name, relations)
13
10
  super(table_name)
14
11
  @relations = relations
15
12
  end
16
13
 
17
14
  def run
18
- %w(create update delete undelete).map do |transition|
19
- Thread.new { public_send(:"load_#{transition}") }
20
- end.each(&:join)
15
+ %w(create update delete undelete).each do |transition|
16
+ public_send(:"load_#{transition}")
17
+ end
21
18
  end
22
19
 
23
20
  def dependencies
24
- relations.values.map { |d| Load.step_name(d) }.to_set
21
+ @relations.values.map { |d| Load.step_name(d) }.to_set
25
22
  end
26
23
 
27
24
  def load_create
28
25
  just_now = now
29
- database[table_name].import(
30
- data_columns + [:external_source, :created_at, :updated_at],
31
- database[:"#{stage_schema}__#{table_name}"]
32
- .select(*data_columns)
33
- .where(
34
- import_run_id: run_id,
35
- transition: 'CREATE'
36
- )
37
- .select_more(external_source, just_now, just_now)
38
- )
26
+
27
+ database.execute <<-SQL
28
+ INSERT INTO #{public_table_name_sql}
29
+ (#{data_columns.join(', ')}, external_source, created_at, updated_at)
30
+ SELECT
31
+ #{data_columns.join(', ')},
32
+ '#{external_source}',
33
+ '#{just_now}',
34
+ '#{just_now}'
35
+ FROM #{stage_table_name_sql}
36
+ WHERE transition = 'CREATE'
37
+ SQL
39
38
  end
40
39
 
41
40
  def load_update
42
- updates = updatable_columns.reduce({updated_at: now}) do |acc, column|
43
- acc[column] = :"stage__#{column}"
44
- acc
45
- end
46
-
47
- database.from(
48
- :"#{table_name}___public",
49
- :"#{stage_schema}__#{table_name}___stage"
50
- )
51
- .where(
52
- stage__id: :public__id,
53
- stage__transition: 'UPDATE',
54
- stage__import_run_id: run_id,
55
- )
56
- .update(updates)
41
+ database.execute <<-SQL
42
+ UPDATE #{public_table_name_sql} public
43
+ SET
44
+ #{updatable_columns.map { |c| %Q("#{c}" = stage."#{c}") }.join(',')},
45
+ "updated_at" = '#{now}'
46
+ FROM #{stage_table_name_sql} stage
47
+ WHERE stage.id = public.id
48
+ AND stage.transition = 'UPDATE'
49
+ SQL
57
50
  end
58
51
 
59
52
  def load_delete
60
53
  just_now = now
61
- database.from(
62
- :"#{table_name}___public",
63
- :"#{stage_schema}__#{table_name}___stage"
64
- )
65
- .where(
66
- stage__id: :public__id,
67
- stage__transition: 'DELETE',
68
- stage__import_run_id: run_id,
69
- )
70
- .update(
71
- updated_at: just_now,
72
- deleted_at: just_now,
73
- )
54
+
55
+ database.execute <<-SQL
56
+ UPDATE #{public_table_name_sql} public
57
+ SET
58
+ updated_at = '#{just_now}',
59
+ deleted_at = '#{just_now}'
60
+ FROM #{stage_table_name_sql} stage
61
+ WHERE stage.id = public.id
62
+ AND stage.transition = 'DELETE'
63
+ SQL
74
64
  end
75
65
 
76
66
  def load_undelete
77
- updates = updatable_columns.reduce({updated_at: now, deleted_at: nil}) do |acc, column|
78
- acc[column] = :"stage__#{column}"
79
- acc
80
- end
81
-
82
- database.from(
83
- :"#{table_name}___public",
84
- :"#{stage_schema}__#{table_name}___stage"
85
- )
86
- .where(
87
- stage__id: :public__id,
88
- stage__transition: 'UNDELETE',
89
- stage__import_run_id: run_id,
90
- )
91
- .update(updates)
67
+ database.execute <<-SQL
68
+ UPDATE #{public_table_name_sql} public
69
+ SET
70
+ #{updatable_columns.map { |c| %Q("#{c}" = stage."#{c}") }.join(',')},
71
+ updated_at = '#{now}',
72
+ deleted_at = NULL
73
+ FROM #{stage_table_name_sql} stage
74
+ WHERE stage.id = public.id
75
+ AND stage.transition = 'UNDELETE'
76
+ SQL
92
77
  end
93
78
 
94
79
  private
@@ -98,7 +83,7 @@ module BeetleETL
98
83
  end
99
84
 
100
85
  def table_columns
101
- @table_columns ||= database[:"#{stage_schema}__#{table_name}"].columns
86
+ @table_columns ||= database[stage_table_name.to_sym].columns
102
87
  end
103
88
 
104
89
  def ignored_columns