beetle_etl 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +23 -0
  3. data/.travis.yml +12 -0
  4. data/Gemfile +4 -0
  5. data/LICENSE.txt +22 -0
  6. data/README.md +31 -0
  7. data/Rakefile +5 -0
  8. data/beetle_etl.gemspec +30 -0
  9. data/lib/beetle_etl.rb +85 -0
  10. data/lib/beetle_etl/dsl/dsl.rb +37 -0
  11. data/lib/beetle_etl/dsl/transformation.rb +26 -0
  12. data/lib/beetle_etl/dsl/transformation_loader.rb +22 -0
  13. data/lib/beetle_etl/import.rb +37 -0
  14. data/lib/beetle_etl/state.rb +67 -0
  15. data/lib/beetle_etl/steps/assign_ids.rb +54 -0
  16. data/lib/beetle_etl/steps/load.rb +108 -0
  17. data/lib/beetle_etl/steps/map_relations.rb +31 -0
  18. data/lib/beetle_etl/steps/step.rb +42 -0
  19. data/lib/beetle_etl/steps/table_diff.rb +155 -0
  20. data/lib/beetle_etl/steps/transform.rb +22 -0
  21. data/lib/beetle_etl/task_runner/dependency_resolver.rb +39 -0
  22. data/lib/beetle_etl/task_runner/task_runner.rb +64 -0
  23. data/lib/beetle_etl/version.rb +3 -0
  24. data/script/postgres +12 -0
  25. data/spec/beetle_etl_spec.rb +70 -0
  26. data/spec/dependency_resolver_spec.rb +57 -0
  27. data/spec/dsl/dsl_spec.rb +44 -0
  28. data/spec/dsl/transformation_loader_spec.rb +51 -0
  29. data/spec/dsl/transformation_spec.rb +54 -0
  30. data/spec/feature/example_schema.rb +192 -0
  31. data/spec/feature/example_transform.rb +37 -0
  32. data/spec/feature/feature_spec.rb +48 -0
  33. data/spec/import_spec.rb +7 -0
  34. data/spec/spec_helper.rb +25 -0
  35. data/spec/state_spec.rb +124 -0
  36. data/spec/steps/assign_ids_spec.rb +107 -0
  37. data/spec/steps/load_spec.rb +148 -0
  38. data/spec/steps/map_relations_spec.rb +92 -0
  39. data/spec/steps/step_spec.rb +37 -0
  40. data/spec/steps/table_diff_spec.rb +183 -0
  41. data/spec/steps/transform_spec.rb +34 -0
  42. data/spec/support/database.yml.example +9 -0
  43. data/spec/support/database.yml.travis +4 -0
  44. data/spec/support/database_helpers.rb +58 -0
  45. metadata +220 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: ebb61022b0e58e217f1b215486a993c7e43799a2
4
+ data.tar.gz: e00ad7086cf5be7c4cadf520cbd21c1ebd47202d
5
+ SHA512:
6
+ metadata.gz: 8e3c2be8adf3cb65807fddb95d1b09d1de9064f586f3e0aefa6026ff3e7b572b8f9653c31ca759114228e6cb6470495f21b74f7470ad6fb7745136210b87f31e
7
+ data.tar.gz: 3d91601c914486564b8db8e3afdef62c90b04b54fe3633321547de3ec437ee1f12ead442265e84561c4578d2dae482be2cebc85dfd610c76e3db877239d4a2d5
data/.gitignore ADDED
@@ -0,0 +1,23 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ spec/support/database.yml
16
+ test/tmp
17
+ test/version_tmp
18
+ tmp
19
+ *.bundle
20
+ *.so
21
+ *.o
22
+ *.a
23
+ mkmf.log
data/.travis.yml ADDED
@@ -0,0 +1,12 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.0.0
4
+ - 2.1.2
5
+ addons:
6
+ postgresql: "9.3"
7
+ code_climate:
8
+ repo_token: dd18697b0acb6be343db62982b753b72676e8342701cc0442121de2d12ee6549
9
+
10
+ before_script:
11
+ - psql -c 'create database travis_ci_test;' -U postgres
12
+ - cp spec/support/database.yml.travis spec/support/database.yml
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in beetle_etl.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2014 Luciano Maiwald
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,31 @@
1
+ # BeetleETL
2
+ [![Build Status](https://travis-ci.org/maiwald/beetle_etl.svg?branch=master)](https://travis-ci.org/maiwald/beetle_etl)
3
+ [![Code Climate](https://codeclimate.com/github/maiwald/beetle_etl.png)](https://codeclimate.com/github/maiwald/beetle_etl)
4
+
5
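+ BeetleETL helps you with your recurring ETL imports by taking care of synchronizing external data with referential data in your application.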
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ gem 'beetle_etl'
12
+
13
+ And then execute:
14
+
15
+ $ bundle
16
+
17
+ Or install it yourself as:
18
+
19
+ $ gem install beetle_etl
20
+
21
+ ## Usage
22
+
23
+ TODO: Write usage instructions here
24
+
25
+ ## Contributing
26
+
27
+ 1. Fork it ( https://github.com/maiwald/beetle_etl/fork )
28
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
29
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
30
+ 4. Push to the branch (`git push origin my-new-feature`)
31
+ 5. Create a new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,5 @@
1
# Rake entry point: pulls in Bundler's gem tasks and RSpec's rake task,
# and makes running the specs the default task.
require 'bundler/gem_tasks'
require 'rspec/core/rake_task'

RSpec::Core::RakeTask.new

task default: :spec
@@ -0,0 +1,30 @@
1
# coding: utf-8
# Gem specification for beetle_etl.

# Put lib/ on the load path so the version file can be required below.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'beetle_etl/version'

Gem::Specification.new do |gem|
  gem.name        = 'beetle_etl'
  gem.version     = BeetleETL::VERSION
  gem.authors     = ['Luciano Maiwald']
  gem.email       = ['luciano.maiwald@gmail.com']
  gem.summary     = 'BeetleETL helps you with your recurring ETL imports.'
  gem.description = 'Taking care of synchronizing external data with referential data in your application.'
  gem.homepage    = 'https://github.com/maiwald/beetle_etl'
  gem.license     = 'MIT'

  # Package every file tracked by git; executables and test files are
  # derived from that list by path prefix.
  gem.files         = `git ls-files -z`.split("\x0")
  gem.executables   = gem.files.grep(%r{^bin/}) { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ['lib']

  # Runtime dependencies: name => version requirement.
  {
    'sequel'    => '>= 4.13.0',
    'celluloid' => '>= 0.15.2'
  }.each do |name, requirement|
    gem.add_runtime_dependency(name, requirement)
  end

  # Development dependencies: [name] or [name, version requirement].
  [
    ['bundler', '~> 1.6'],
    ['rake'],
    ['rspec', '~> 3.0.0'],
    ['pg'],
    ['codeclimate-test-reporter'],
    ['activesupport']
  ].each do |dependency|
    gem.add_development_dependency(*dependency)
  end
end
data/lib/beetle_etl.rb ADDED
@@ -0,0 +1,85 @@
1
+ require 'beetle_etl/version'
2
+
3
+ require 'sequel'
4
+
5
+ module BeetleETL
6
+
7
+ InvalidConfigurationError = Class.new(StandardError)
8
+
9
+ require 'beetle_etl/dsl/dsl'
10
+ require 'beetle_etl/dsl/transformation'
11
+ require 'beetle_etl/dsl/transformation_loader'
12
+
13
+ require 'beetle_etl/steps/step'
14
+ require 'beetle_etl/steps/transform'
15
+ require 'beetle_etl/steps/map_relations'
16
+ require 'beetle_etl/steps/table_diff'
17
+ require 'beetle_etl/steps/assign_ids'
18
+ require 'beetle_etl/steps/load'
19
+
20
+ require 'beetle_etl/task_runner/dependency_resolver'
21
+ require 'beetle_etl/task_runner/task_runner'
22
+
23
+ require 'beetle_etl/state'
24
+ require 'beetle_etl/import'
25
+
26
# Settings container for BeetleETL.
#
# All attributes are plain read/write accessors. Only +stage_schema+ has a
# default ('stage'); every other attribute starts out as nil.
class Configuration
  attr_accessor :database_config
  attr_accessor :database
  attr_accessor :transformation_file
  attr_accessor :stage_schema
  attr_accessor :external_source

  def initialize
    self.stage_schema = 'stage'
  end
end
38
+
39
class << self

  # Runs one import and records its outcome on the shared state object.
  #
  # The run is registered via +state.start_import+ before anything else.
  # Any exception raised while importing (Exception, not only
  # StandardError) marks the run as failed and is then re-raised unchanged.
  # A connection opened by #database from +database_config+ is disconnected
  # afterwards in every case; a Database object supplied through
  # +config.database+ is not touched here.
  def import
    state.start_import

    begin
      Import.run
      state.mark_as_succeeded
    rescue Exception
      state.mark_as_failed
      raise
    ensure
      @database && @database.disconnect
    end
  end

  # Yields the configuration object so the caller can set its attributes.
  def configure
    yield config
  end

  # Returns the memoized Configuration, creating it on first use.
  def config
    @config ||= Configuration.new
  end

  # Returns the database handle to use.
  #
  # An explicitly configured +config.database+ wins. Otherwise a connection
  # is opened once via Sequel.connect(config.database_config) and memoized.
  #
  # Raises InvalidConfigurationError when neither setting is present.
  def database
    return config.database if config.database

    unless config.database_config
      raise InvalidConfigurationError,
            "Either Sequel connection database_config or a Sequel Database object required"
    end

    @database ||= Sequel.connect(config.database_config)
  end

  # Returns the memoized State, creating it on first use.
  def state
    @state ||= State.new
  end

  # Forgets the memoized configuration, state and connection.
  # The connection is only dropped from memory here, not disconnected.
  def reset
    @config = @state = @database = nil
  end

end
85
+ end
@@ -0,0 +1,37 @@
1
module BeetleETL
  # Collects the declarations made for one table: the foreign-key relations
  # and the query string. It also exposes helper methods that return SQL
  # fragments (stage table name, combined key expression) and the current
  # import run id.
  class DSL

    # relations    - Hash mapping foreign key column => foreign table.
    # query_string - the value last passed to #query (nil until then).
    attr_reader :relations, :query_string

    # table_name - name of the table these declarations belong to.
    def initialize(table_name)
      @table_name = table_name
      @relations = {}
    end

    # Declares that column +on+ references +foreign_table+.
    #
    # foreign_table - the referenced table.
    # on            - the foreign key column (required keyword).
    #
    # Raises ArgumentError when +on+ is omitted. The default expression is
    # only evaluated when the keyword is missing, which emulates a required
    # keyword argument while still parsing on Ruby 2.0.
    def references(foreign_table, on: raise(ArgumentError, 'missing keyword: on'))
      @relations[on] = foreign_table
    end

    # Stores the query string for this table.
    def query(query)
      @query_string = query
    end

    # Returns the double-quoted, schema-qualified stage table name, using
    # the schema from BeetleETL.config.stage_schema.
    def stage_table
      %Q("#{BeetleETL.config.stage_schema}"."#{@table_name}")
    end

    # Returns the external source identifier.
    # NOTE(review): always the literal 'source'; BeetleETL.config.external_source
    # is not consulted here - confirm this is intended.
    def external_source
      'source'
    end

    # Builds a SQL expression that concatenates the given expressions into
    # a single bracketed, comma-separated string: '[' || a || ',' || b || ']'.
    def combined_key(*args)
      %Q('[' || #{args.join(%q[ || ',' || ])} || ']')
    end

    # Returns the id of the current import run (BeetleETL.state.run_id).
    def import_run_id
      BeetleETL.state.run_id
    end

  end
end
@@ -0,0 +1,26 @@
1
+ require 'set'
2
+
3
module BeetleETL
  # One table's import definition. The setup block is evaluated against a
  # fresh DSL instance; the declarations it records are exposed through
  # #relations, #dependencies and #query.
  class Transformation

    attr_reader :table_name

    # table_name - name of the table being imported.
    # setup      - a block (Proc) evaluated in the context of a DSL object.
    def initialize(table_name, setup)
      @table_name = table_name
      @parsed = DSL.new(table_name)
      @parsed.instance_eval(&setup)
    end

    # Returns the relations recorded by the DSL object.
    def relations
      @parsed.relations
    end

    # Returns the Set of tables referenced by this table's relations.
    def dependencies
      Set.new(relations.values)
    end

    # Returns the query string recorded by the DSL object.
    def query
      @parsed.query_string
    end

  end
end
@@ -0,0 +1,22 @@
1
module BeetleETL
  # Loads the transformation definitions from the file named by
  # BeetleETL.config.transformation_file.
  module TransformationLoader
    extend self

    # Evaluates the transformation file in the context of this module and
    # returns the Transformation objects created by its +import+ calls, in
    # the order they appear in the file. Every call starts from an empty list.
    #
    # The file path is passed to instance_eval so that syntax errors and
    # exceptions raised by the file are reported with the file's own name
    # and line numbers rather than "(eval)".
    #
    # NOTE: the file is executed as Ruby code; it must come from a trusted
    # source.
    def load
      @transformations = []

      path = BeetleETL.config.transformation_file
      instance_eval(File.read(path), path.to_s)

      @transformations
    end

    private

    # Called from within the transformation file: registers a
    # Transformation for +table_name+ configured by the +setup+ block.
    def import(table_name, &setup)
      @transformations << Transformation.new(table_name, setup)
    end

  end
end
@@ -0,0 +1,37 @@
1
module BeetleETL
  # Orchestrates one import: the per-table data steps are handed to
  # TaskRunner, then every table's Load step runs inside a single
  # database transaction.
  module Import

    extend self

    # Runs all data steps, then all load steps within one transaction.
    def run
      TaskRunner.run(data_steps)

      BeetleETL.database.transaction do
        load_steps.each { |step| step.run }
      end
    end

    private

    # Flat list of Transform, MapRelations, TableDiff and AssignIds steps,
    # four per transformation, in transformation order.
    def data_steps
      transformations.flat_map do |transformation|
        table = transformation.table_name

        [
          Transform.new(table, transformation.dependencies, transformation.query),
          MapRelations.new(table, transformation.relations),
          TableDiff.new(table),
          AssignIds.new(table),
        ]
      end
    end

    # One Load step per transformation.
    def load_steps
      transformations.map { |transformation| Load.new(transformation.table_name) }
    end

    # Transformations are loaded once and memoized on this module.
    def transformations
      @transformations ||= TransformationLoader.load
    end

  end
end
@@ -0,0 +1,67 @@
1
module BeetleETL

  ImportAleadyRunning = Class.new(StandardError)
  ImportSchemaNotFound = Class.new(StandardError)
  ImportNotRunning = Class.new(StandardError)

  # Tracks import runs in the import_runs table of the stage schema:
  # starting a run, marking it SUCCEEDED or FAILED, and looking up run ids.
  class State

    # Inserts a new RUNNING row and remembers the value returned by the
    # insert as the current run id.
    #
    # Raises ImportAleadyRunning if a row in state RUNNING already exists.
    def start_import
      raise ImportAleadyRunning if import_already_running?

      @run_id = import_runs_dataset.insert(state: 'RUNNING', started_at: now)
    end

    # Marks the current run as SUCCEEDED and stamps finished_at.
    def mark_as_succeeded
      mark_as('SUCCEEDED')
    end

    # Marks the current run as FAILED and stamps finished_at.
    def mark_as_failed
      mark_as('FAILED')
    end

    # Returns the id of the run started by #start_import.
    #
    # Raises ImportNotRunning if no run has been started on this object.
    def run_id
      return @run_id unless @run_id.nil?

      raise ImportNotRunning
    end

    # Returns the highest id among SUCCEEDED runs, or nil if there is none.
    def last_run_id
      succeeded = import_runs_dataset.select(:id).where(state: 'SUCCEEDED')
      newest = succeeded.order(Sequel.desc(:id)).first

      newest[:id] unless newest.nil?
    end

    private

    # Symbol of the form :<stage_schema>__import_runs.
    # NOTE(review): presumably relies on Sequel's schema__table symbol
    # convention - confirm against the Sequel version in use.
    def import_runs_table
      :"#{BeetleETL.config.stage_schema}__import_runs"
    end

    def import_already_running?
      running = import_runs_dataset.where(state: 'RUNNING')
      !running.count.zero?
    end

    def now
      Time.now
    end

    # Writes the given state and a finished_at timestamp to the current run.
    def mark_as(state)
      current_run = import_runs_dataset.filter(id: run_id)
      current_run.update(state: state, finished_at: now)
    end

    def import_runs_dataset
      BeetleETL.database[import_runs_table]
    end

  end
end
@@ -0,0 +1,54 @@
1
module BeetleETL
  # Step that fills the id column of a table's stage rows for the current
  # import run: new rows draw an id from the table's sequence, all other
  # rows copy the id of the matching row in the public table.
  # NOTE(review): table_name, run_id, stage_schema and database are not
  # defined in this class; presumably they are provided by Step - confirm.
  class AssignIds < Step

    # This step depends on the TableDiff step of the same table.
    def dependencies
      Set[TableDiff.step_name(table_name)]
    end

    def run
      assign_new_ids
      map_existing_ids
    end

    # Sets id to NEXTVAL('public.<table_name>_id_seq') for every stage row
    # of the current run whose transition is CREATE.
    def assign_new_ids
      created_rows = stage_table.where(
        import_run_id: run_id,
        transition: 'CREATE'
      )
      sequence_name = "public.#{table_name}_id_seq"

      created_rows.update(id: Sequel.function(:NEXTVAL, sequence_name))
    end

    # Copies the public table's id onto every stage row of the current run
    # whose transition is KEEP, UPDATE, DELETE or UNDELETE, matching rows
    # on external_id.
    def map_existing_ids
      joined = stage_table.from(stage_table_identifier, public_table_identifier)
      matching = joined.where(
        stage__import_run_id: run_id,
        stage__transition: %w(KEEP UPDATE DELETE UNDELETE),
        stage__external_id: :public__external_id
      )

      matching.update(id: :public__id)
    end

    private

    # Symbol of the form :<stage_schema>__<table_name>___stage.
    # NOTE(review): presumably Sequel's schema__table___alias symbol
    # convention, aliasing the stage table as "stage" - confirm.
    def stage_table_identifier
      "#{stage_schema}__#{table_name}___stage".to_sym
    end

    def stage_table
      database[stage_table_identifier]
    end

    # Symbol of the form :<table_name>___public.
    def public_table_identifier
      "#{table_name}___public".to_sym
    end

    def public_table
      database[public_table_identifier]
    end

  end
end