beetle_etl 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (45) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +23 -0
  3. data/.travis.yml +12 -0
  4. data/Gemfile +4 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +31 -0
  7. data/Rakefile +5 -0
  8. data/beetle_etl.gemspec +30 -0
  9. data/lib/beetle_etl.rb +85 -0
  10. data/lib/beetle_etl/dsl/dsl.rb +37 -0
  11. data/lib/beetle_etl/dsl/transformation.rb +26 -0
  12. data/lib/beetle_etl/dsl/transformation_loader.rb +22 -0
  13. data/lib/beetle_etl/import.rb +37 -0
  14. data/lib/beetle_etl/state.rb +67 -0
  15. data/lib/beetle_etl/steps/assign_ids.rb +54 -0
  16. data/lib/beetle_etl/steps/load.rb +108 -0
  17. data/lib/beetle_etl/steps/map_relations.rb +31 -0
  18. data/lib/beetle_etl/steps/step.rb +42 -0
  19. data/lib/beetle_etl/steps/table_diff.rb +155 -0
  20. data/lib/beetle_etl/steps/transform.rb +22 -0
  21. data/lib/beetle_etl/task_runner/dependency_resolver.rb +39 -0
  22. data/lib/beetle_etl/task_runner/task_runner.rb +64 -0
  23. data/lib/beetle_etl/version.rb +3 -0
  24. data/script/postgres +12 -0
  25. data/spec/beetle_etl_spec.rb +70 -0
  26. data/spec/dependency_resolver_spec.rb +57 -0
  27. data/spec/dsl/dsl_spec.rb +44 -0
  28. data/spec/dsl/transformation_loader_spec.rb +51 -0
  29. data/spec/dsl/transformation_spec.rb +54 -0
  30. data/spec/feature/example_schema.rb +192 -0
  31. data/spec/feature/example_transform.rb +37 -0
  32. data/spec/feature/feature_spec.rb +48 -0
  33. data/spec/import_spec.rb +7 -0
  34. data/spec/spec_helper.rb +25 -0
  35. data/spec/state_spec.rb +124 -0
  36. data/spec/steps/assign_ids_spec.rb +107 -0
  37. data/spec/steps/load_spec.rb +148 -0
  38. data/spec/steps/map_relations_spec.rb +92 -0
  39. data/spec/steps/step_spec.rb +37 -0
  40. data/spec/steps/table_diff_spec.rb +183 -0
  41. data/spec/steps/transform_spec.rb +34 -0
  42. data/spec/support/database.yml.example +9 -0
  43. data/spec/support/database.yml.travis +4 -0
  44. data/spec/support/database_helpers.rb +58 -0
  45. metadata +220 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: ebb61022b0e58e217f1b215486a993c7e43799a2
4
+ data.tar.gz: e00ad7086cf5be7c4cadf520cbd21c1ebd47202d
5
+ SHA512:
6
+ metadata.gz: 8e3c2be8adf3cb65807fddb95d1b09d1de9064f586f3e0aefa6026ff3e7b572b8f9653c31ca759114228e6cb6470495f21b74f7470ad6fb7745136210b87f31e
7
+ data.tar.gz: 3d91601c914486564b8db8e3afdef62c90b04b54fe3633321547de3ec437ee1f12ead442265e84561c4578d2dae482be2cebc85dfd610c76e3db877239d4a2d5
data/.gitignore ADDED
@@ -0,0 +1,23 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ spec/support/database.yml
16
+ test/tmp
17
+ test/version_tmp
18
+ tmp
19
+ *.bundle
20
+ *.so
21
+ *.o
22
+ *.a
23
+ mkmf.log
data/.travis.yml ADDED
@@ -0,0 +1,12 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.0.0
4
+ - 2.1.2
5
+ addons:
6
+ postgresql: "9.3"
7
+ code_climate:
8
+ repo_token: dd18697b0acb6be343db62982b753b72676e8342701cc0442121de2d12ee6549
9
+
10
+ before_script:
11
+ - psql -c 'create database travis_ci_test;' -U postgres
12
+ - cp spec/support/database.yml.travis spec/support/database.yml
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in beetle_etl.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Luciano Maiwald
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,31 @@
1
+ # BeetleETL
2
+ [![Build Status](https://travis-ci.org/maiwald/beetle_etl.svg?branch=master)](https://travis-ci.org/maiwald/beetle_etl)
3
+ [![Code Climate](https://codeclimate.com/github/maiwald/beetle_etl.png)](https://codeclimate.com/github/maiwald/beetle_etl)
4
+
5
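+ BeetleETL helps you with your recurring ETL imports, taking care of synchronizing external data with referential data in your application.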
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ gem 'beetle_etl'
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install beetle_etl
20
+
21
+ ## Usage
22
+
23
+ TODO: Write usage instructions here
24
+
25
+ ## Contributing
26
+
27
+ 1. Fork it ( https://github.com/maiwald/beetle_etl/fork )
28
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
29
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
30
+ 4. Push to the branch (`git push origin my-new-feature`)
31
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,5 @@
1
+ require "bundler/gem_tasks"
2
+
3
+ require 'rspec/core/rake_task'
4
+ task :default => :spec
5
+ RSpec::Core::RakeTask.new
data/beetle_etl.gemspec ADDED
@@ -0,0 +1,30 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'beetle_etl/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = 'beetle_etl'
8
+ spec.version = BeetleETL::VERSION
9
+ spec.authors = ['Luciano Maiwald']
10
+ spec.email = ['luciano.maiwald@gmail.com']
11
+ spec.summary = %q{BeetleETL helps you with your recurring ETL imports.}
12
+ spec.description = %q{Taking care of synchronizing external data with referential data in your application.}
13
+ spec.homepage = 'https://github.com/maiwald/beetle_etl'
14
+ spec.license = 'MIT'
15
+
16
+ spec.files = `git ls-files -z`.split("\x0")
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
+ spec.require_paths = ['lib']
20
+
21
+ spec.add_runtime_dependency 'sequel', '>= 4.13.0'
22
+ spec.add_runtime_dependency 'celluloid', '>= 0.15.2'
23
+
24
+ spec.add_development_dependency 'bundler', '~> 1.6'
25
+ spec.add_development_dependency 'rake'
26
+ spec.add_development_dependency 'rspec', '~> 3.0.0'
27
+ spec.add_development_dependency 'pg'
28
+ spec.add_development_dependency 'codeclimate-test-reporter'
29
+ spec.add_development_dependency 'activesupport'
30
+ end
data/lib/beetle_etl.rb ADDED
@@ -0,0 +1,85 @@
1
+ require 'beetle_etl/version'
2
+
3
+ require 'sequel'
4
+
5
+ module BeetleETL
6
+
7
+ InvalidConfigurationError = Class.new(StandardError)
8
+
9
+ require 'beetle_etl/dsl/dsl'
10
+ require 'beetle_etl/dsl/transformation'
11
+ require 'beetle_etl/dsl/transformation_loader'
12
+
13
+ require 'beetle_etl/steps/step'
14
+ require 'beetle_etl/steps/transform'
15
+ require 'beetle_etl/steps/map_relations'
16
+ require 'beetle_etl/steps/table_diff'
17
+ require 'beetle_etl/steps/assign_ids'
18
+ require 'beetle_etl/steps/load'
19
+
20
+ require 'beetle_etl/task_runner/dependency_resolver'
21
+ require 'beetle_etl/task_runner/task_runner'
22
+
23
+ require 'beetle_etl/state'
24
+ require 'beetle_etl/import'
25
+
26
+ class Configuration
27
+ attr_accessor \
28
+ :database_config,
29
+ :database,
30
+ :transformation_file,
31
+ :stage_schema,
32
+ :external_source
33
+
34
+ def initialize
35
+ @stage_schema = 'stage'
36
+ end
37
+ end
38
+
39
+ class << self
40
+
41
+ def import
42
+ state.start_import
43
+
44
+ begin
45
+ Import.run
46
+ state.mark_as_succeeded
47
+ rescue Exception => e
48
+ state.mark_as_failed
49
+ raise e
50
+ ensure
51
+ @database.disconnect if @database
52
+ end
53
+ end
54
+
55
+ def configure
56
+ yield(config)
57
+ end
58
+
59
+ def config
60
+ @config ||= Configuration.new
61
+ end
62
+
63
+ def database
64
+ if config.database
65
+ config.database
66
+ elsif config.database_config
67
+ @database ||= Sequel.connect(config.database_config)
68
+ else
69
+ msg = "Either Sequel connection database_config or a Sequel Database object required"
70
+ raise InvalidConfigurationError.new(msg)
71
+ end
72
+ end
73
+
74
+ def state
75
+ @state ||= State.new
76
+ end
77
+
78
+ def reset
79
+ @config = nil
80
+ @state = nil
81
+ @database = nil
82
+ end
83
+
84
+ end
85
+ end
data/lib/beetle_etl/dsl/dsl.rb ADDED
@@ -0,0 +1,37 @@
1
module BeetleETL
  # Collects the declarations made for one table: the relations to other
  # tables and the query string. The remaining methods are helpers that
  # return SQL fragments or values for use while building that query.
  class DSL

    # relations    - Hash filled by #references (key given as +on+ => table).
    # query_string - the String last passed to #query (nil until then).
    attr_reader :relations, :query_string

    # table_name - name of the table this declaration block belongs to.
    def initialize(table_name)
      @table_name = table_name
      @relations = {}
    end

    # Records that this table references +foreign_table+.
    #
    # foreign_table - the referenced table.
    # on            - required; the key the relation is stored under
    #                 (presumably the foreign key column - confirm).
    #
    # Raises ArgumentError when +on+ is omitted. The default expression is
    # used instead of a required keyword so the signature keeps working on
    # Ruby 2.0.
    def references(foreign_table, on: raise(ArgumentError, 'missing keyword: on'))
      @relations[on] = foreign_table
    end

    # Stores +query+ so it can be read back through #query_string.
    def query(query)
      @query_string = query
    end

    # Returns the double-quoted, schema-qualified name of this table inside
    # the configured stage schema, e.g. "stage"."organisations".
    def stage_table
      "\"#{BeetleETL.config.stage_schema}\".\"#{@table_name}\""
    end

    # Returns the fixed string 'source'.
    def external_source
      'source'
    end

    # Builds an SQL expression that concatenates the given expressions into
    # one bracketed, comma separated value: '[' || a || ',' || b || ']'.
    def combined_key(*args)
      separator = " || ',' || "
      "'[' || #{args.join(separator)} || ']'"
    end

    # Returns the id of the current import run (raises whatever
    # BeetleETL.state.run_id raises when no run is active).
    def import_run_id
      BeetleETL.state.run_id
    end

  end
end
data/lib/beetle_etl/dsl/transformation.rb ADDED
@@ -0,0 +1,26 @@
1
+ require 'set'
2
+
3
module BeetleETL
  # One table's transformation: the table name plus whatever its setup block
  # declared when evaluated against a fresh DSL instance.
  class Transformation

    attr_reader :table_name

    # table_name - name of the table being transformed.
    # setup      - a Proc; it is instance_eval'ed on a new DSL for table_name.
    def initialize(table_name, setup)
      @table_name = table_name
      @parsed = DSL.new(table_name)
      @parsed.instance_eval(&setup)
    end

    # The relations Hash collected by the DSL.
    def relations
      @parsed.relations
    end

    # The distinct tables referenced by the relations, as a Set.
    def dependencies
      Set.new(relations.values)
    end

    # The query string collected by the DSL.
    def query
      @parsed.query_string
    end

  end
end
data/lib/beetle_etl/dsl/transformation_loader.rb ADDED
@@ -0,0 +1,22 @@
1
module BeetleETL
  # Reads the configured transformation file and turns every `import` call
  # found in it into a Transformation.
  module TransformationLoader
    extend self

    # Evaluates the file named by BeetleETL.config.transformation_file in the
    # context of this module and returns the Transformations it declared, in
    # declaration order. Each call starts from an empty list.
    #
    # The file is executed as Ruby code, so it must come from a trusted
    # source. Its path is handed to instance_eval so that errors raised by
    # the file report the real file name and line instead of "(eval)".
    def load
      @transformations = []

      path = BeetleETL.config.transformation_file
      instance_eval(File.read(path), path.to_s)

      @transformations
    end

    private

    # Called from inside the transformation file.
    #
    # table_name - table the transformation is for.
    # setup      - block holding the declarations for that table.
    def import(table_name, &setup)
      @transformations << Transformation.new(table_name, setup)
    end

  end
end
data/lib/beetle_etl/import.rb ADDED
@@ -0,0 +1,37 @@
1
module BeetleETL
  # Wires the loaded transformations into steps and executes them.
  module Import

    extend self

    # Hands all data steps to TaskRunner.run, then runs every Load step, in
    # transformation order, inside a single database transaction.
    def run
      TaskRunner.run(data_steps)

      BeetleETL.database.transaction do
        load_steps.each { |step| step.run }
      end
    end

    private

    # Flat list of the Transform, MapRelations, TableDiff and AssignIds steps
    # (in that order) for each transformation.
    def data_steps
      transformations.flat_map do |transformation|
        table = transformation.table_name

        [
          Transform.new(table, transformation.dependencies, transformation.query),
          MapRelations.new(table, transformation.relations),
          TableDiff.new(table),
          AssignIds.new(table),
        ]
      end
    end

    # One Load step per transformation.
    def load_steps
      transformations.map { |transformation| Load.new(transformation.table_name) }
    end

    # Transformations are loaded once and memoized on this module.
    # NOTE(review): BeetleETL.reset does not clear this cache - confirm that
    # reusing it across imports is intended.
    def transformations
      @transformations ||= TransformationLoader.load
    end

  end
end
data/lib/beetle_etl/state.rb ADDED
@@ -0,0 +1,67 @@
1
module BeetleETL

  # Raised by State#start_import when a run is already in state 'RUNNING'.
  ImportAlreadyRunning = Class.new(StandardError)
  # Original, misspelled name of ImportAlreadyRunning. It refers to the very
  # same class, so existing `rescue ImportAleadyRunning` clauses keep working.
  ImportAleadyRunning = ImportAlreadyRunning
  # Not raised anywhere in this file.
  ImportSchemaNotFound = Class.new(StandardError)
  # Raised by State#run_id when no run has been started on this object.
  ImportNotRunning = Class.new(StandardError)

  # Tracks import runs in the import_runs table of the stage schema.
  class State

    # Inserts a new row in state 'RUNNING' and remembers its id as the
    # current run.
    #
    # Raises ImportAlreadyRunning when the table already holds a row in state
    # 'RUNNING'. The check and the insert are two separate statements.
    def start_import
      raise ImportAlreadyRunning if import_already_running?

      @run_id = import_runs_dataset.insert(
        state: 'RUNNING',
        started_at: now
      )
    end

    # Marks the current run as 'SUCCEEDED' and stamps finished_at.
    def mark_as_succeeded
      mark_as('SUCCEEDED')
    end

    # Marks the current run as 'FAILED' and stamps finished_at.
    def mark_as_failed
      mark_as('FAILED')
    end

    # Returns the id of the run started by #start_import.
    #
    # Raises ImportNotRunning when no run was started on this object.
    def run_id
      raise ImportNotRunning if @run_id.nil?
      @run_id
    end

    # Returns the highest id among runs in state 'SUCCEEDED', or nil when
    # there is none.
    def last_run_id
      last_import = import_runs_dataset
        .select(:id)
        .where(state: 'SUCCEEDED')
        .order(Sequel.desc(:id))
        .first

      last_import && last_import[:id]
    end

    private

    # NOTE(review): this symbol relies on Sequel splitting `schema__table`
    # into a qualified identifier, which Sequel 5 no longer does by default -
    # confirm the Sequel versions this gem is meant to support.
    def import_runs_table
      "#{BeetleETL.config.stage_schema}__import_runs".to_sym
    end

    def import_already_running?
      import_runs_dataset.where(state: 'RUNNING').count > 0
    end

    def now
      Time.now
    end

    # Updates the row of the current run; raises ImportNotRunning (through
    # #run_id) when there is none.
    def mark_as(state)
      import_runs_dataset.filter(id: run_id).update(
        state: state,
        finished_at: now
      )
    end

    def import_runs_dataset
      BeetleETL.database[import_runs_table]
    end

  end
end
data/lib/beetle_etl/steps/assign_ids.rb ADDED
@@ -0,0 +1,54 @@
1
module BeetleETL
  # Step that fills the id column of the stage table for the current run.
  # table_name, run_id, stage_schema and database come from the Step
  # superclass, which is defined elsewhere.
  class AssignIds < Step

    # This step depends on the TableDiff step of the same table.
    def dependencies
      Set.new([TableDiff.step_name(table_name)])
    end

    def run
      assign_new_ids
      map_existing_ids
    end

    # Rows of the current run with transition 'CREATE' get their id from
    # NEXTVAL on the sequence named "public.<table_name>_id_seq".
    def assign_new_ids
      next_id = Sequel.function(:NEXTVAL, "public.#{table_name}_id_seq")
      created_rows = stage_table.where(import_run_id: run_id, transition: 'CREATE')

      created_rows.update(id: next_id)
    end

    # Rows of the current run with one of the other listed transitions take
    # over the id of the public row that has the same external_id.
    def map_existing_ids
      matching_rows = {
        stage__import_run_id: run_id,
        stage__transition: %w[KEEP UPDATE DELETE UNDELETE],
        stage__external_id: :public__external_id
      }

      stage_table
        .from(stage_table_identifier, public_table_identifier)
        .where(matching_rows)
        .update(id: :public__id)
    end

    private

    # NOTE(review): the identifiers below rely on Sequel splitting symbols on
    # `__` (qualifier) and `___` (alias), which Sequel 5 no longer does by
    # default - confirm the Sequel versions this gem is meant to support.

    # The stage table, aliased as "stage".
    def stage_table_identifier
      "#{stage_schema}__#{table_name}___stage".to_sym
    end

    def stage_table
      database[stage_table_identifier]
    end

    # The target table, aliased as "public".
    def public_table_identifier
      "#{table_name}___public".to_sym
    end

    def public_table
      database[public_table_identifier]
    end

  end
end