beetle_etl 1.0.1 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.byebug_history +8 -0
- data/.travis.yml +6 -1
- data/README.md +31 -9
- data/beetle_etl.gemspec +1 -1
- data/lib/beetle_etl.rb +7 -49
- data/lib/beetle_etl/configuration.rb +39 -0
- data/lib/beetle_etl/dsl/dsl.rb +6 -2
- data/lib/beetle_etl/dsl/transformation.rb +2 -2
- data/lib/beetle_etl/dsl/transformation_loader.rb +4 -3
- data/lib/beetle_etl/import.rb +15 -11
- data/lib/beetle_etl/naming.rb +10 -20
- data/lib/beetle_etl/reporter.rb +3 -2
- data/lib/beetle_etl/step_runner/async_step_runner.rb +6 -4
- data/lib/beetle_etl/steps/create_stage.rb +2 -2
- data/lib/beetle_etl/steps/load.rb +2 -2
- data/lib/beetle_etl/steps/map_relations.rb +2 -2
- data/lib/beetle_etl/steps/step.rb +23 -4
- data/lib/beetle_etl/steps/transform.rb +2 -2
- data/lib/beetle_etl/testing.rb +10 -5
- data/lib/beetle_etl/testing/test_wrapper.rb +4 -4
- data/lib/beetle_etl/version.rb +1 -1
- data/spec/beetle_etl_spec.rb +6 -38
- data/spec/configuration_spec.rb +66 -0
- data/spec/dsl/dsl_spec.rb +9 -3
- data/spec/dsl/transformation_loader_spec.rb +9 -8
- data/spec/dsl/transformation_spec.rb +9 -7
- data/spec/feature/feature_spec.rb +8 -8
- data/spec/reporter_spec.rb +5 -2
- data/spec/spec_helper.rb +4 -5
- data/spec/steps/assign_ids_spec.rb +7 -7
- data/spec/steps/create_stage_spec.rb +14 -12
- data/spec/steps/load_spec.rb +9 -7
- data/spec/steps/map_relations_spec.rb +14 -8
- data/spec/steps/step_spec.rb +5 -3
- data/spec/steps/table_diff_spec.rb +7 -6
- data/spec/steps/transform_spec.rb +8 -4
- data/spec/testing_spec.rb +1 -1
- metadata +9 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6a726a9734d6866687319a6742cc8db41ef68b64
|
4
|
+
data.tar.gz: 82be15d660033bd3d957879ec351c0479b26623f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 01c1408c035afb9d0dcadb9382a3db442318f48c74517f1376c3a6ef615675ef891df8559c172be6e6b1f7f47d2f29c4a6965493a613d93a24d439c40f224246
|
7
|
+
data.tar.gz: 111f73fd2692ed88f4ce91c6f4da838efe13d1d12fc00bfb4f63cb00383509ffa1d171bf265362ded8f01ddde7f0a0e0659d552f7be8b31188c94bb4bb984e59
|
data/.byebug_history
ADDED
data/.travis.yml
CHANGED
@@ -1,11 +1,16 @@
|
|
1
1
|
language: ruby
|
2
2
|
rvm:
|
3
3
|
- 2.0.0
|
4
|
-
- 2.1.
|
4
|
+
- 2.1.0
|
5
|
+
- 2.2.0
|
6
|
+
- 2.3.0
|
5
7
|
addons:
|
6
8
|
postgresql: "9.3"
|
7
9
|
code_climate:
|
8
10
|
repo_token: fcd6d8c28da900609a2cf903716d858621b8ce68152edbcebe6908a9a3f5d3d5
|
11
|
+
before_install:
|
12
|
+
- gem update --system
|
13
|
+
- gem update bundler
|
9
14
|
|
10
15
|
before_script:
|
11
16
|
- psql -c 'create database travis_ci_test;' -U postgres
|
data/README.md
CHANGED
@@ -32,12 +32,34 @@ Make sure the tables you want to import contain columns named ```external_id```
|
|
32
32
|
|
33
33
|
### Configuration
|
34
34
|
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
config.
|
40
|
-
|
35
|
+
Create a configuration object
|
36
|
+
|
37
|
+
configuration = BeetleETL::Configuration.new do |config|
|
38
|
+
# path to your transformation file
|
39
|
+
config.transformation_file = "../my_fancy_transformations"
|
40
|
+
|
41
|
+
# sequel database config
|
42
|
+
config.database_config = {
|
43
|
+
adapter: 'postgres'
|
44
|
+
encoding: utf8
|
45
|
+
host: my_host
|
46
|
+
database: my_database
|
47
|
+
username: 'foo'
|
48
|
+
password: 'bar'
|
49
|
+
pool: 5
|
50
|
+
pool_timeout: 360
|
51
|
+
connect_timeout: 360
|
52
|
+
}
|
53
|
+
# or config.database = # sequel database instance
|
54
|
+
|
55
|
+
# name of your source
|
56
|
+
config.external_source = "important_data"
|
57
|
+
|
58
|
+
# target schema in case you use postgres schemas
|
59
|
+
config.target_schema = "public" # default
|
60
|
+
|
61
|
+
# logger
|
62
|
+
config.logger = Logger.new(STDOUT) # default
|
41
63
|
end
|
42
64
|
|
43
65
|
### Defining Imports
|
@@ -66,8 +88,8 @@ Fill a ```transformation``` file with import directives like this:
|
|
66
88
|
ON data.org_id = o.id
|
67
89
|
SQL
|
68
90
|
end
|
69
|
-
|
70
|
-
|
91
|
+
|
92
|
+
|
71
93
|
```import``` takes the name of the table you want to fill and the configuration as arguments.
|
72
94
|
With ```columns``` you define what columns BeetleETL is supposed to fill in your application’s table.
|
73
95
|
The ```query``` transforms the data. Make sure that you insert into ```#{stage_table}```: the name of the actual table that the query inserts into will be filled in by BeetleETL at runtime.
|
@@ -76,7 +98,7 @@ Define any foreign references your table has to other tables using the ```refrec
|
|
76
98
|
|
77
99
|
### Running BeetleETL
|
78
100
|
|
79
|
-
BeetleETL.import
|
101
|
+
BeetleETL.import(configuration)
|
80
102
|
|
81
103
|
## Development
|
82
104
|
|
data/beetle_etl.gemspec
CHANGED
@@ -21,7 +21,7 @@ Gem::Specification.new do |spec|
|
|
21
21
|
spec.add_runtime_dependency 'sequel', '>= 4.0.0'
|
22
22
|
spec.add_runtime_dependency 'activesupport', '>= 4.0.0'
|
23
23
|
|
24
|
-
spec.add_development_dependency 'bundler', '~> 1.
|
24
|
+
spec.add_development_dependency 'bundler', '~> 1.11'
|
25
25
|
spec.add_development_dependency 'rspec', '>= 3.0.0'
|
26
26
|
spec.add_development_dependency 'timecop', '>= 0.7.0'
|
27
27
|
spec.add_development_dependency 'pg', '>= 0.18.0'
|
data/lib/beetle_etl.rb
CHANGED
@@ -5,7 +5,7 @@ require 'logger'
|
|
5
5
|
|
6
6
|
module BeetleETL
|
7
7
|
|
8
|
-
|
8
|
+
require 'beetle_etl/configuration'
|
9
9
|
|
10
10
|
require 'beetle_etl/dsl/dsl'
|
11
11
|
require 'beetle_etl/dsl/transformation'
|
@@ -28,61 +28,19 @@ module BeetleETL
|
|
28
28
|
require 'beetle_etl/import'
|
29
29
|
require 'beetle_etl/reporter'
|
30
30
|
|
31
|
-
class Configuration
|
32
|
-
attr_accessor \
|
33
|
-
:database_config,
|
34
|
-
:database,
|
35
|
-
:transformation_file,
|
36
|
-
:stage_schema,
|
37
|
-
:target_schema,
|
38
|
-
:external_source,
|
39
|
-
:logger
|
40
|
-
|
41
|
-
def initialize
|
42
|
-
@target_schema = 'public'
|
43
|
-
@logger = ::Logger.new(STDOUT)
|
44
|
-
end
|
45
|
-
end
|
46
|
-
|
47
31
|
class << self
|
48
32
|
|
49
|
-
def import
|
33
|
+
def import(config = Configuration.new)
|
34
|
+
yield config if block_given?
|
35
|
+
|
50
36
|
begin
|
51
|
-
report = Import.new.run
|
52
|
-
Reporter.new(report).log_summary
|
37
|
+
report = Import.new(config).run
|
38
|
+
Reporter.new(config, report).log_summary
|
53
39
|
report
|
54
40
|
ensure
|
55
|
-
|
56
|
-
end
|
57
|
-
end
|
58
|
-
|
59
|
-
def configure
|
60
|
-
yield(config)
|
61
|
-
end
|
62
|
-
|
63
|
-
def config
|
64
|
-
@config ||= Configuration.new
|
65
|
-
end
|
66
|
-
|
67
|
-
def logger
|
68
|
-
config.logger
|
69
|
-
end
|
70
|
-
|
71
|
-
def database
|
72
|
-
if config.database
|
73
|
-
config.database
|
74
|
-
elsif config.database_config
|
75
|
-
@database ||= Sequel.connect(config.database_config)
|
76
|
-
else
|
77
|
-
msg = "Either Sequel connection database_config or a Sequel Database object required"
|
78
|
-
raise InvalidConfigurationError.new(msg)
|
41
|
+
config.disconnect_database
|
79
42
|
end
|
80
43
|
end
|
81
44
|
|
82
|
-
def reset
|
83
|
-
@config = nil
|
84
|
-
@database = nil
|
85
|
-
end
|
86
|
-
|
87
45
|
end
|
88
46
|
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module BeetleETL
|
2
|
+
InvalidConfigurationError = Class.new(StandardError)
|
3
|
+
|
4
|
+
class Configuration
|
5
|
+
attr_accessor \
|
6
|
+
:transformation_file,
|
7
|
+
:stage_schema,
|
8
|
+
:external_source,
|
9
|
+
:logger
|
10
|
+
|
11
|
+
attr_writer \
|
12
|
+
:database,
|
13
|
+
:database_config,
|
14
|
+
:target_schema
|
15
|
+
|
16
|
+
def initialize
|
17
|
+
@target_schema = 'public'
|
18
|
+
@logger = ::Logger.new(STDOUT)
|
19
|
+
end
|
20
|
+
|
21
|
+
def database
|
22
|
+
if [@database, @database_config].none?
|
23
|
+
msg = "Either Sequel connection database_config or a Sequel Database object required"
|
24
|
+
raise InvalidConfigurationError.new(msg)
|
25
|
+
end
|
26
|
+
|
27
|
+
@database ||= Sequel.connect(@database_config)
|
28
|
+
end
|
29
|
+
|
30
|
+
def disconnect_database
|
31
|
+
database.disconnect if @database_config
|
32
|
+
end
|
33
|
+
|
34
|
+
def target_schema
|
35
|
+
@target_schema != 'public' ? @target_schema : nil
|
36
|
+
end
|
37
|
+
|
38
|
+
end
|
39
|
+
end
|
data/lib/beetle_etl/dsl/dsl.rb
CHANGED
@@ -3,7 +3,8 @@ module BeetleETL
|
|
3
3
|
|
4
4
|
attr_reader :column_names, :relations, :query_strings
|
5
5
|
|
6
|
-
def initialize(table_name)
|
6
|
+
def initialize(config, table_name)
|
7
|
+
@config = config
|
7
8
|
@table_name = table_name
|
8
9
|
@column_names = []
|
9
10
|
@relations = {}
|
@@ -25,7 +26,10 @@ module BeetleETL
|
|
25
26
|
# query helper methods
|
26
27
|
|
27
28
|
def stage_table(table_name = nil)
|
28
|
-
BeetleETL::Naming.stage_table_name_sql(
|
29
|
+
BeetleETL::Naming.stage_table_name_sql(
|
30
|
+
@config.external_source,
|
31
|
+
table_name || @table_name
|
32
|
+
)
|
29
33
|
end
|
30
34
|
|
31
35
|
def combined_key(*args)
|
@@ -5,9 +5,9 @@ module BeetleETL
|
|
5
5
|
|
6
6
|
attr_reader :table_name
|
7
7
|
|
8
|
-
def initialize(table_name, setup, helpers = nil)
|
8
|
+
def initialize(config, table_name, setup, helpers = nil)
|
9
9
|
@table_name = table_name
|
10
|
-
@parsed = DSL.new(table_name).tap do |dsl|
|
10
|
+
@parsed = DSL.new(config, table_name).tap do |dsl|
|
11
11
|
dsl.instance_exec(&helpers) if helpers
|
12
12
|
dsl.instance_exec(&setup)
|
13
13
|
end
|
@@ -1,18 +1,19 @@
|
|
1
1
|
module BeetleETL
|
2
2
|
class TransformationLoader
|
3
3
|
|
4
|
-
def initialize
|
4
|
+
def initialize(config)
|
5
|
+
@config = config
|
5
6
|
@transformations = []
|
6
7
|
@helper_definitions = nil
|
7
8
|
end
|
8
9
|
|
9
10
|
def load
|
10
|
-
File.open(
|
11
|
+
File.open(@config.transformation_file, 'r') do |file|
|
11
12
|
instance_eval file.read
|
12
13
|
end
|
13
14
|
|
14
15
|
@transformations.map do |(table_name, setup)|
|
15
|
-
Transformation.new(table_name, setup, @helper_definitions)
|
16
|
+
Transformation.new(@config, table_name, setup, @helper_definitions)
|
16
17
|
end
|
17
18
|
end
|
18
19
|
|
data/lib/beetle_etl/import.rb
CHANGED
@@ -3,6 +3,10 @@ require 'active_support/core_ext/hash/deep_merge'
|
|
3
3
|
module BeetleETL
|
4
4
|
class Import
|
5
5
|
|
6
|
+
def initialize(config)
|
7
|
+
@config = config
|
8
|
+
end
|
9
|
+
|
6
10
|
def run
|
7
11
|
setup
|
8
12
|
import
|
@@ -12,14 +16,14 @@ module BeetleETL
|
|
12
16
|
|
13
17
|
def setup
|
14
18
|
transformations.each do |t|
|
15
|
-
CreateStage.new(t.table_name, t.relations, t.column_names).run
|
19
|
+
CreateStage.new(@config, t.table_name, t.relations, t.column_names).run
|
16
20
|
end
|
17
21
|
end
|
18
22
|
|
19
23
|
def import
|
20
|
-
data_report = AsyncStepRunner.new(data_steps).run
|
21
|
-
load_report =
|
22
|
-
AsyncStepRunner.new(load_steps).run
|
24
|
+
data_report = AsyncStepRunner.new(@config, data_steps).run
|
25
|
+
load_report = @config.database.transaction do
|
26
|
+
AsyncStepRunner.new(@config, load_steps).run
|
23
27
|
end
|
24
28
|
|
25
29
|
data_report.deep_merge load_report
|
@@ -27,7 +31,7 @@ module BeetleETL
|
|
27
31
|
|
28
32
|
def cleanup
|
29
33
|
transformations.each do |t|
|
30
|
-
DropStage.new(t.table_name).run
|
34
|
+
DropStage.new(@config, t.table_name).run
|
31
35
|
end
|
32
36
|
end
|
33
37
|
|
@@ -36,22 +40,22 @@ module BeetleETL
|
|
36
40
|
def data_steps
|
37
41
|
transformations.flat_map do |t|
|
38
42
|
[
|
39
|
-
Transform.new(t.table_name, t.dependencies, t.query),
|
40
|
-
MapRelations.new(t.table_name, t.relations),
|
41
|
-
TableDiff.new(t.table_name),
|
42
|
-
AssignIds.new(t.table_name),
|
43
|
+
Transform.new(@config, t.table_name, t.dependencies, t.query),
|
44
|
+
MapRelations.new(@config, t.table_name, t.relations),
|
45
|
+
TableDiff.new(@config, t.table_name),
|
46
|
+
AssignIds.new(@config, t.table_name),
|
43
47
|
]
|
44
48
|
end
|
45
49
|
end
|
46
50
|
|
47
51
|
def load_steps
|
48
52
|
transformations.map do |t|
|
49
|
-
Load.new(t.table_name, t.relations)
|
53
|
+
Load.new(@config, t.table_name, t.relations)
|
50
54
|
end
|
51
55
|
end
|
52
56
|
|
53
57
|
def transformations
|
54
|
-
@transformations ||= TransformationLoader.new.load
|
58
|
+
@transformations ||= TransformationLoader.new(@config).load
|
55
59
|
end
|
56
60
|
|
57
61
|
end
|
data/lib/beetle_etl/naming.rb
CHANGED
@@ -5,32 +5,22 @@ module BeetleETL
|
|
5
5
|
|
6
6
|
extend self
|
7
7
|
|
8
|
-
def stage_table_name(table_name
|
9
|
-
|
10
|
-
digest
|
11
|
-
"#{BeetleETL.config.external_source}-#{name}-#{digest}"[0, 63]
|
8
|
+
def stage_table_name(external_source, table_name)
|
9
|
+
digest = Digest::MD5.hexdigest(table_name.to_s)
|
10
|
+
"#{external_source.to_s}-#{table_name.to_s}-#{digest}"[0, 63]
|
12
11
|
end
|
13
12
|
|
14
|
-
def stage_table_name_sql(table_name
|
15
|
-
%Q("#{stage_table_name(table_name)}")
|
13
|
+
def stage_table_name_sql(external_source, table_name)
|
14
|
+
%Q("#{stage_table_name(external_source, table_name)}")
|
16
15
|
end
|
17
16
|
|
18
|
-
def target_table_name(table_name
|
19
|
-
|
20
|
-
[
|
17
|
+
def target_table_name(target_schema, table_name)
|
18
|
+
schema = target_schema ? target_schema.to_s : nil
|
19
|
+
[schema, table_name.to_s].compact.join('.')
|
21
20
|
end
|
22
21
|
|
23
|
-
def target_table_name_sql(table_name
|
24
|
-
|
25
|
-
target_table_name= [target_schema, name].compact.join('"."')
|
26
|
-
%Q("#{target_table_name}")
|
27
|
-
end
|
28
|
-
|
29
|
-
private
|
30
|
-
|
31
|
-
def target_schema
|
32
|
-
target_schema = BeetleETL.config.target_schema
|
33
|
-
target_schema != 'public' ? target_schema : nil
|
22
|
+
def target_table_name_sql(target_schema, table_name)
|
23
|
+
%Q("#{target_table_name(target_schema, table_name)}")
|
34
24
|
end
|
35
25
|
|
36
26
|
end
|
data/lib/beetle_etl/reporter.rb
CHANGED
@@ -1,7 +1,9 @@
|
|
1
1
|
module BeetleETL
|
2
2
|
class AsyncStepRunner
|
3
3
|
|
4
|
-
def initialize(steps)
|
4
|
+
def initialize(config, steps)
|
5
|
+
@config = config
|
6
|
+
|
5
7
|
@dependency_resolver = DependencyResolver.new(steps)
|
6
8
|
@steps = steps
|
7
9
|
|
@@ -39,14 +41,14 @@ module BeetleETL
|
|
39
41
|
def run_step_async(step)
|
40
42
|
Thread.new do
|
41
43
|
begin
|
42
|
-
|
44
|
+
@config.logger.info("started step #{step.name}")
|
43
45
|
|
44
46
|
started_at = Time.now
|
45
47
|
step.run
|
46
48
|
finished_at = Time.now
|
47
49
|
|
48
50
|
duration = Time.at(finished_at - started_at).utc.strftime("%H:%M:%S")
|
49
|
-
|
51
|
+
@config.logger.info("finished #{step.name} in #{duration}")
|
50
52
|
|
51
53
|
@queue.push [
|
52
54
|
step.table_name,
|
@@ -55,7 +57,7 @@ module BeetleETL
|
|
55
57
|
]
|
56
58
|
|
57
59
|
rescue => e
|
58
|
-
|
60
|
+
@config.logger.fatal(e.message)
|
59
61
|
raise e
|
60
62
|
end
|
61
63
|
end.abort_on_exception = true
|