beetle_etl 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +23 -0
- data/.travis.yml +12 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +31 -0
- data/Rakefile +5 -0
- data/beetle_etl.gemspec +30 -0
- data/lib/beetle_etl.rb +85 -0
- data/lib/beetle_etl/dsl/dsl.rb +37 -0
- data/lib/beetle_etl/dsl/transformation.rb +26 -0
- data/lib/beetle_etl/dsl/transformation_loader.rb +22 -0
- data/lib/beetle_etl/import.rb +37 -0
- data/lib/beetle_etl/state.rb +67 -0
- data/lib/beetle_etl/steps/assign_ids.rb +54 -0
- data/lib/beetle_etl/steps/load.rb +108 -0
- data/lib/beetle_etl/steps/map_relations.rb +31 -0
- data/lib/beetle_etl/steps/step.rb +42 -0
- data/lib/beetle_etl/steps/table_diff.rb +155 -0
- data/lib/beetle_etl/steps/transform.rb +22 -0
- data/lib/beetle_etl/task_runner/dependency_resolver.rb +39 -0
- data/lib/beetle_etl/task_runner/task_runner.rb +64 -0
- data/lib/beetle_etl/version.rb +3 -0
- data/script/postgres +12 -0
- data/spec/beetle_etl_spec.rb +70 -0
- data/spec/dependency_resolver_spec.rb +57 -0
- data/spec/dsl/dsl_spec.rb +44 -0
- data/spec/dsl/transformation_loader_spec.rb +51 -0
- data/spec/dsl/transformation_spec.rb +54 -0
- data/spec/feature/example_schema.rb +192 -0
- data/spec/feature/example_transform.rb +37 -0
- data/spec/feature/feature_spec.rb +48 -0
- data/spec/import_spec.rb +7 -0
- data/spec/spec_helper.rb +25 -0
- data/spec/state_spec.rb +124 -0
- data/spec/steps/assign_ids_spec.rb +107 -0
- data/spec/steps/load_spec.rb +148 -0
- data/spec/steps/map_relations_spec.rb +92 -0
- data/spec/steps/step_spec.rb +37 -0
- data/spec/steps/table_diff_spec.rb +183 -0
- data/spec/steps/transform_spec.rb +34 -0
- data/spec/support/database.yml.example +9 -0
- data/spec/support/database.yml.travis +4 -0
- data/spec/support/database_helpers.rb +58 -0
- metadata +220 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: ebb61022b0e58e217f1b215486a993c7e43799a2
|
4
|
+
data.tar.gz: e00ad7086cf5be7c4cadf520cbd21c1ebd47202d
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 8e3c2be8adf3cb65807fddb95d1b09d1de9064f586f3e0aefa6026ff3e7b572b8f9653c31ca759114228e6cb6470495f21b74f7470ad6fb7745136210b87f31e
|
7
|
+
data.tar.gz: 3d91601c914486564b8db8e3afdef62c90b04b54fe3633321547de3ec437ee1f12ead442265e84561c4578d2dae482be2cebc85dfd610c76e3db877239d4a2d5
|
data/.gitignore
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
.bundle
|
4
|
+
.config
|
5
|
+
.yardoc
|
6
|
+
Gemfile.lock
|
7
|
+
InstalledFiles
|
8
|
+
_yardoc
|
9
|
+
coverage
|
10
|
+
doc/
|
11
|
+
lib/bundler/man
|
12
|
+
pkg
|
13
|
+
rdoc
|
14
|
+
spec/reports
|
15
|
+
spec/support/database.yml
|
16
|
+
test/tmp
|
17
|
+
test/version_tmp
|
18
|
+
tmp
|
19
|
+
*.bundle
|
20
|
+
*.so
|
21
|
+
*.o
|
22
|
+
*.a
|
23
|
+
mkmf.log
|
data/.travis.yml
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
language: ruby
|
2
|
+
rvm:
|
3
|
+
- 2.0.0
|
4
|
+
- 2.1.2
|
5
|
+
addons:
|
6
|
+
postgresql: "9.3"
|
7
|
+
code_climate:
|
8
|
+
repo_token: dd18697b0acb6be343db62982b753b72676e8342701cc0442121de2d12ee6549
|
9
|
+
|
10
|
+
before_script:
|
11
|
+
- psql -c 'create database travis_ci_test;' -U postgres
|
12
|
+
- cp spec/support/database.yml.travis spec/support/database.yml
|
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2014 Luciano Maiwald
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
# BeetleETL
|
2
|
+
[![Build Status](https://travis-ci.org/maiwald/beetle_etl.svg?branch=master)](https://travis-ci.org/maiwald/beetle_etl)
|
3
|
+
[![Code Climate](https://codeclimate.com/github/maiwald/beetle_etl.png)](https://codeclimate.com/github/maiwald/beetle_etl)
|
4
|
+
|
5
|
+
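BeetleETL helps you with your recurring ETL imports, taking care of synchronizing external data with referential data in your application.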
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Add this line to your application's Gemfile:
|
10
|
+
|
11
|
+
gem 'beetle_etl'
|
12
|
+
|
13
|
+
And then execute:
|
14
|
+
|
15
|
+
$ bundle
|
16
|
+
|
17
|
+
Or install it yourself as:
|
18
|
+
|
19
|
+
$ gem install beetle_etl
|
20
|
+
|
21
|
+
## Usage
|
22
|
+
|
23
|
+
TODO: Write usage instructions here
|
24
|
+
|
25
|
+
## Contributing
|
26
|
+
|
27
|
+
1. Fork it ( https://github.com/maiwald/beetle_etl/fork )
|
28
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
29
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
30
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
31
|
+
5. Create a new Pull Request
|
data/Rakefile
ADDED
data/beetle_etl.gemspec
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
# coding: utf-8
# Gem specification for beetle_etl.

# Put the gem's own lib/ directory on the load path so the version constant
# can be required before the gem is installed.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'beetle_etl/version'

Gem::Specification.new do |gem|
  # Identity and metadata
  gem.name        = 'beetle_etl'
  gem.version     = BeetleETL::VERSION
  gem.authors     = ['Luciano Maiwald']
  gem.email       = ['luciano.maiwald@gmail.com']
  gem.summary     = 'BeetleETL helps you with your recurring ETL imports.'
  gem.description = 'Taking care of synchronizing external data with referential data in your application.'
  gem.homepage    = 'https://github.com/maiwald/beetle_etl'
  gem.license     = 'MIT'

  # Packaged files: everything tracked by git (NUL-separated listing).
  tracked_files = `git ls-files -z`.split("\x0")
  gem.files         = tracked_files
  gem.executables   = gem.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  gem.test_files    = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ['lib']

  # Runtime dependencies
  gem.add_runtime_dependency 'sequel', '>= 4.13.0'
  gem.add_runtime_dependency 'celluloid', '>= 0.15.2'

  # Development dependencies
  gem.add_development_dependency 'bundler', '~> 1.6'
  gem.add_development_dependency 'rake'
  gem.add_development_dependency 'rspec', '~> 3.0.0'
  gem.add_development_dependency 'pg'
  gem.add_development_dependency 'codeclimate-test-reporter'
  gem.add_development_dependency 'activesupport'
end
|
data/lib/beetle_etl.rb
ADDED
@@ -0,0 +1,85 @@
|
|
1
|
+
require 'beetle_etl/version'
|
2
|
+
|
3
|
+
require 'sequel'
|
4
|
+
|
5
|
+
# Top-level namespace and entry point of the gem.
#
# Holds the configuration, the database handle and the import state at module
# level, and exposes BeetleETL.import to run a complete import.
module BeetleETL

  # Raised by BeetleETL.database when neither a Sequel database object nor
  # connection settings have been configured.
  InvalidConfigurationError = Class.new(StandardError)

  require 'beetle_etl/dsl/dsl'
  require 'beetle_etl/dsl/transformation'
  require 'beetle_etl/dsl/transformation_loader'

  require 'beetle_etl/steps/step'
  require 'beetle_etl/steps/transform'
  require 'beetle_etl/steps/map_relations'
  require 'beetle_etl/steps/table_diff'
  require 'beetle_etl/steps/assign_ids'
  require 'beetle_etl/steps/load'

  require 'beetle_etl/task_runner/dependency_resolver'
  require 'beetle_etl/task_runner/task_runner'

  require 'beetle_etl/state'
  require 'beetle_etl/import'

  # User-facing settings, filled in through BeetleETL.configure.
  #
  # database_config     - settings handed to Sequel.connect when no database
  #                       object is given.
  # database            - a ready-made database object; takes precedence over
  #                       database_config.
  # transformation_file - path of the file read by the transformation loader.
  # stage_schema        - name of the staging schema (defaults to 'stage').
  # external_source     - NOTE(review): not read by any code visible here;
  #                       confirm its intended use.
  class Configuration
    attr_accessor \
      :database_config,
      :database,
      :transformation_file,
      :stage_schema,
      :external_source

    def initialize
      @stage_schema = 'stage'
    end
  end

  class << self

    # Runs a complete import and records its outcome in the import state.
    #
    # The run is marked as failed and the error re-raised when anything goes
    # wrong while importing. A connection opened by BeetleETL.database is
    # always disconnected afterwards, including when starting the import
    # itself fails (for example because another import is still running).
    #
    # Returns the result of marking the run as succeeded.
    def import
      state.start_import

      begin
        Import.run
        state.mark_as_succeeded
      rescue Exception
        # Deliberately broad: whatever aborts the run, it must not be left
        # in the RUNNING state. The original exception is re-raised as-is.
        state.mark_as_failed
        raise
      end
    ensure
      # Only the connection created here is closed; a database object
      # supplied through the configuration is left untouched.
      @database.disconnect if @database
    end

    # Yields the configuration object so callers can assign settings.
    def configure
      yield(config)
    end

    # Returns the memoized Configuration instance.
    def config
      @config ||= Configuration.new
    end

    # Returns the database to work on: the configured database object if
    # present, otherwise a memoized connection built from database_config.
    #
    # Raises InvalidConfigurationError when neither is configured.
    def database
      if config.database
        config.database
      elsif config.database_config
        @database ||= Sequel.connect(config.database_config)
      else
        msg = "Either Sequel connection database_config or a Sequel Database object required"
        raise InvalidConfigurationError, msg
      end
    end

    # Returns the memoized import State.
    def state
      @state ||= State.new
    end

    # Forgets the configuration, state and memoized connection.
    def reset
      @config = nil
      @state = nil
      @database = nil
    end

  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module BeetleETL
  # Evaluation context for a single table's transformation definition.
  #
  # Collects the declared relations and the query string, and offers helper
  # methods that definitions can use while building their SQL.
  class DSL

    attr_reader :relations, :query_string

    # table_name - name of the table this definition belongs to.
    def initialize(table_name)
      @table_name = table_name
      @relations = {}
    end

    # Declares that the column +on+ refers to +foreign_table+.
    #
    # The +on+ keyword is mandatory. The raising default keeps that
    # requirement on Ruby versions without required keyword arguments while
    # reporting a meaningful ArgumentError instead of a NameError.
    #
    # Raises ArgumentError when +on+ is omitted.
    def references(foreign_table, on: raise(ArgumentError, 'missing keyword: on'))
      @relations[on] = foreign_table
    end

    # Stores the query string of this definition.
    def query(query)
      @query_string = query
    end

    # Returns the quoted, schema-qualified name of this table in the
    # configured stage schema.
    def stage_table
      %Q("#{BeetleETL.config.stage_schema}"."#{@table_name}")
    end

    # Returns the external source identifier (currently the fixed string
    # 'source').
    def external_source
      'source'
    end

    # Returns an SQL expression concatenating the given expressions into a
    # bracketed, comma separated string, e.g. '[' || a || ',' || b || ']'.
    def combined_key(*args)
      %Q('[' || #{args.join(%q[ || ',' || ])} || ']')
    end

    # Returns the id of the current import run.
    def import_run_id
      BeetleETL.state.run_id
    end

  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require 'set'
|
2
|
+
|
3
|
+
module BeetleETL
  # A single table's transformation, built by evaluating its definition block
  # in the context of a DSL object.
  class Transformation

    attr_reader :table_name

    # table_name - name of the table the transformation belongs to.
    # setup      - block holding the definition; evaluated against a new DSL
    #              instance.
    def initialize(table_name, setup)
      @table_name = table_name
      @parsed = DSL.new(table_name)
      @parsed.instance_eval(&setup)
    end

    # Returns the relations collected by the definition.
    def relations
      @parsed.relations
    end

    # Returns the set of tables referenced by the relations.
    def dependencies
      Set.new(relations.values)
    end

    # Returns the query string stored by the definition.
    def query
      @parsed.query_string
    end

  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module BeetleETL
  # Reads the configured transformation file and turns every `import`
  # declaration in it into a Transformation.
  module TransformationLoader
    extend self

    # Evaluates the configured transformation file and returns the
    # transformations it declares, in declaration order. Every call starts
    # from an empty list.
    #
    # The file path is handed to instance_eval so that errors raised by the
    # file report its real name and line instead of "(eval)".
    def load
      @transformations = []

      path = BeetleETL.config.transformation_file
      File.open(path, 'r') do |file|
        instance_eval(file.read, path.to_s)
      end

      @transformations
    end

    private

    # Called from within the transformation file: registers the definition
    # block +setup+ for +table_name+.
    def import(table_name, &setup)
      @transformations << Transformation.new(table_name, setup)
    end

  end
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
module BeetleETL
  # Orchestrates one import: first the data steps of every transformation are
  # handed to the task runner, then all load steps run in one transaction.
  module Import

    extend self

    # Runs the data steps through TaskRunner, then runs every load step
    # inside a single database transaction.
    def run
      TaskRunner.run(data_steps)

      BeetleETL.database.transaction do
        load_steps.each { |step| step.run }
      end
    end

    private

    # Returns the flat list of data steps of all transformations.
    def data_steps
      transformations.map { |transformation| steps_for(transformation) }.flatten
    end

    # Returns the data steps of a single transformation, in the order
    # transform, map relations, table diff, assign ids.
    def steps_for(transformation)
      name = transformation.table_name

      [
        Transform.new(name, transformation.dependencies, transformation.query),
        MapRelations.new(transformation.table_name, transformation.relations),
        TableDiff.new(transformation.table_name),
        AssignIds.new(transformation.table_name),
      ]
    end

    # Returns one Load step per transformation.
    def load_steps
      transformations.map { |transformation| Load.new(transformation.table_name) }
    end

    # Returns the transformations, loaded once and memoized on the module.
    def transformations
      @transformations ||= TransformationLoader.load
    end

  end
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
module BeetleETL

  # Raised when an import is started while another run is still RUNNING.
  # The misspelled name is kept for existing callers; ImportAlreadyRunning
  # is the same class under the correct spelling.
  ImportAleadyRunning = Class.new(StandardError)
  ImportAlreadyRunning = ImportAleadyRunning
  ImportSchemaNotFound = Class.new(StandardError)
  # Raised when the current run id is requested before an import was started.
  ImportNotRunning = Class.new(StandardError)

  # Tracks import runs in the import_runs table of the stage schema.
  class State

    # Registers a new run in state RUNNING and remembers its id.
    #
    # Raises ImportAleadyRunning when a run in state RUNNING already exists.
    def start_import
      raise ImportAleadyRunning if import_already_running?

      @run_id = import_runs_dataset.insert(
        state: 'RUNNING',
        started_at: now
      )
    end

    # Marks the current run as SUCCEEDED and stamps its finish time.
    def mark_as_succeeded
      mark_as('SUCCEEDED')
    end

    # Marks the current run as FAILED and stamps its finish time.
    def mark_as_failed
      mark_as('FAILED')
    end

    # Returns the id of the current run.
    #
    # Raises ImportNotRunning when no import has been started.
    def run_id
      raise ImportNotRunning if @run_id.nil?
      @run_id
    end

    # Returns the highest id among SUCCEEDED runs, or nil when there is none.
    def last_run_id
      last_import = import_runs_dataset.
        select(:id).
        where(state: 'SUCCEEDED').
        order(Sequel.desc(:id)).
        first

      last_import && last_import[:id]
    end

    private

    # Table identifier of the import_runs table in the stage schema, using
    # the double-underscore schema__table symbol form.
    def import_runs_table
      "#{BeetleETL.config.stage_schema}__import_runs".to_sym
    end

    def import_already_running?
      import_runs_dataset.where(state: 'RUNNING').count > 0
    end

    def now
      Time.now
    end

    # Updates state and finish time of the current run.
    def mark_as(state)
      import_runs_dataset.where(id: run_id).update(
        state: state,
        finished_at: now
      )
    end

    def import_runs_dataset
      BeetleETL.database[import_runs_table]
    end

  end
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
module BeetleETL
  # Step that fills in the id column of the rows staged for the current run.
  #
  # NOTE(review): table_name, run_id, stage_schema and database are not defined
  # here; presumably they come from the Step superclass - confirm.
  class AssignIds < Step

    # This step depends on the table diff step of the same table.
    def dependencies
      Set[TableDiff.step_name(table_name)]
    end

    # Assigns fresh ids first, then copies the ids of already known rows.
    def run
      assign_new_ids
      map_existing_ids
    end

    # Gives every row of the current run with transition CREATE the next
    # value of the sequence public.<table_name>_id_seq.
    def assign_new_ids
      created_rows = stage_table.where(
        import_run_id: run_id,
        transition: 'CREATE'
      )

      created_rows.update(
        id: Sequel.function(:NEXTVAL, "public.#{table_name}_id_seq")
      )
    end

    # Copies the id of the public row with the same external_id onto every
    # row of the current run whose transition is KEEP, UPDATE, DELETE or
    # UNDELETE.
    def map_existing_ids
      matching_rows = stage_table
        .from(stage_table_identifier, public_table_identifier)
        .where(
          stage__import_run_id: run_id,
          stage__transition: %w(KEEP UPDATE DELETE UNDELETE),
          stage__external_id: :public__external_id
        )

      matching_rows.update(id: :public__id)
    end

    private

    # Stage table in the stage schema, aliased as "stage".
    def stage_table_identifier
      :"#{stage_schema}__#{table_name}___stage"
    end

    def stage_table
      database[stage_table_identifier]
    end

    # Target table, aliased as "public".
    def public_table_identifier
      :"#{table_name}___public"
    end

    def public_table
      database[public_table_identifier]
    end

  end
end
|