rodimus 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +1 -1
- data/README.md +27 -1
- data/examples/csv_input_stdout.rb +37 -0
- data/examples/worldbank-sample.csv +49 -0
- data/lib/rodimus/step.rb +4 -1
- data/lib/rodimus/transformation.rb +4 -2
- data/lib/rodimus/version.rb +1 -1
- data/rodimus.gemspec +9 -9
- data/test/transformation_test.rb +27 -0
- metadata +7 -18
- data/examples/mongo_input.rb +0 -40
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 86ccd8d549d00dbd8a6c1ba56ce9eb1fe41aa496
|
4
|
+
data.tar.gz: efdf0cac67bec3a2c2cd469164c9f7bdc812ebd2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: fbfe473e374f6aa661d3cffca3c313b5811f3f332bdfd2860919022806c679383d7b20053ef5e56c4ee2a2fb8f13814c678b14463d0ba17a27b7dc9b76714344
|
7
|
+
data.tar.gz: 1d45b630f57c78f956f7b170ca5d2e8533bda42e88ad3240df112c197579bb95a6032e08deadc1ffe653a1a393eb61553f475815d4d178a1b5c82251afd478b7
|
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -31,7 +31,33 @@ Or install it yourself as:
|
|
31
31
|
|
32
32
|
## Usage
|
33
33
|
|
34
|
-
See the examples directory for the quickest path to success.
|
34
|
+
tl;dr: See the examples directory for the quickest path to success.
|
35
|
+
|
36
|
+
A transformation is an operation that consists of many steps. Each step may
|
37
|
+
manipulate the data in some way. Typically, the first step is reserved for
|
38
|
+
reading from your data source, and the last step is used to write to the new
|
39
|
+
destination.
|
40
|
+
|
41
|
+
In Rodimus, you create a transformation object, and then you add
|
42
|
+
one or more steps to its array of steps. You typically create steps by writing
|
43
|
+
your own classes that include the Rodimus::Step mixin. When the transformation is
|
44
|
+
subsequently run, a new process is forked for each step. All processes are
|
45
|
+
connected together using pipes except for the first and last steps (those being the
|
46
|
+
source and destination steps). Each step then consumes rows of data from its
|
47
|
+
incoming pipe and performs some operation on it before writing it to the
|
48
|
+
outgoing pipe.
|
49
|
+
|
50
|
+
There are several methods on the Rodimus::Step mixin that are able to be
|
51
|
+
overridden for custom processing behavior before, during, or after each
|
52
|
+
row is handled. If those aren't enough, you're also free to manipulate the
|
53
|
+
input/output objects (i.e. to redirect to standard out).
|
54
|
+
|
55
|
+
The Rodimus approach is to provide a minimal, flexible framework upon which
|
56
|
+
custom ETL solutions can be built. ETL is complex, and there tend to be many
|
57
|
+
subtle differences between projects which can make things like establishing
|
58
|
+
conventions and encouraging code reuse difficult. Rodimus is an attempt to
|
59
|
+
codify those things which are probably useful to a majority of ETL projects
|
60
|
+
with as little overhead as possible.
|
35
61
|
|
36
62
|
## Contributing
|
37
63
|
|
@@ -0,0 +1,37 @@
|
|
1
|
+
require 'rodimus'
|
2
|
+
require 'csv'
|
3
|
+
require 'json'
|
4
|
+
|
5
|
+
class CsvInput
|
6
|
+
include Rodimus::Step
|
7
|
+
|
8
|
+
def initialize
|
9
|
+
@incoming = CSV.open('worldbank-sample.csv')
|
10
|
+
@incoming.readline # skip the headers
|
11
|
+
end
|
12
|
+
|
13
|
+
def process_row(row)
|
14
|
+
row.to_json
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
class FormattedText
|
19
|
+
include Rodimus::Step
|
20
|
+
|
21
|
+
def initialize
|
22
|
+
@outgoing = STDOUT.dup
|
23
|
+
end
|
24
|
+
|
25
|
+
def process_row(row)
|
26
|
+
data = JSON.parse(row)
|
27
|
+
"In #{data.first} during #{data[1]}, CO2 emissions were #{data[2]} metric tons per capita."
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
t = Rodimus::Transformation.new
|
32
|
+
s1 = CsvInput.new
|
33
|
+
s2 = FormattedText.new
|
34
|
+
t.steps << s1
|
35
|
+
t.steps << s2
|
36
|
+
t.run
|
37
|
+
puts "Transformation complete!"
|
@@ -0,0 +1,49 @@
|
|
1
|
+
Country,Year,CO2 emissions (metric tons per capita),Electric power consumption (kWh per capita),Energy use (kg of oil equivalent per capita),"Fertility rate, total (births per woman)","GNI per capita, Atlas method (current US$)","Internet users (per 1,000 people)","Life expectancy at birth, total (years)",Military expenditure (% of GDP),"Population, total","Prevalence of HIV, total (% of population ages 15-49)"
|
2
|
+
Belarus,2000,5.91,2988.71,2459.67,1.29,1.38E+03,18.69,68.01,1.26,1.00E+07,
|
3
|
+
Belarus,2001,5.87,2996.81,2476.16,,1300,43.15,,1.44,9970260,
|
4
|
+
Belarus,2002,6.03,2982.77,2539.95,1.25,1370,89.8,68.21,1.4,9925000,
|
5
|
+
Belarus,2003,6.33,3039.1,2628.83,1.25,1610,162.76,,1.3,9873968,0.31
|
6
|
+
Belarus,2004,,3143.58,2725.44,1.24,2150,250.51,68.39,1.36,9824469,
|
7
|
+
Belarus,2005,,,,1.24,2760,347.23,68.48,1.24,9775591,0.34
|
8
|
+
Philippines,2000,1.03,514.02,559.83,,1.06E+03,20.33,69.53,1.08,7.58E+07,
|
9
|
+
Philippines,2001,0.99,535.18,539.4,,1.05E+03,25.89,,0.99,7.72E+07,
|
10
|
+
Philippines,2002,0.99,539.74,537.47,3.5,1.03E+03,44.47,70.19,0.98,7.87E+07,
|
11
|
+
Philippines,2003,0.96,574.47,524.9,,1.08E+03,49.9,,1.03,8.02E+07,0.1
|
12
|
+
Philippines,2004,,597.06,542.39,,1.20E+03,53.91,,0.9,8.16E+07,
|
13
|
+
Philippines,2005,,,,3.2,1.29E+03,,71.04,0.82,8.31E+07,0.1
|
14
|
+
Morocco,2000,1.2,489.04,350.32,2.62,1.19E+03,7.03,68.81,4.13,2.85E+07,
|
15
|
+
Morocco,2001,1.32,508.1,370.93,2.5,1.20E+03,13.87,,4.08,2.88E+07,
|
16
|
+
Morocco,2002,1.32,526.4,373.38,2.5,1.19E+03,23.99,69.48,4.27,2.92E+07,
|
17
|
+
Morocco,2003,1.28,562.93,370.01,2.5,1.34E+03,33.87,,4.22,2.95E+07,0.09
|
18
|
+
Morocco,2004,,594.3,383.8,2.43,1.57E+03,117.3,,4.54,2.98E+07,
|
19
|
+
Morocco,2005,,,,2.4,1.75E+03,152.61,70.38,4.28,3.01E+07,0.1
|
20
|
+
Afghanistan,2000,0.04,,,,,,,,,
|
21
|
+
Afghanistan,2001,0.03,,,,,,,,,
|
22
|
+
Afghanistan,2002,0.03,,,,,0.04,,,,
|
23
|
+
Afghanistan,2003,0.03,,,,,0.73,,,,0.1
|
24
|
+
Afghanistan,2004,,,,,,0.87,,,,
|
25
|
+
Afghanistan,2005,,,,,,1,,,,0.1
|
26
|
+
Nicaragua,2000,0.77,349.37,558.5,3.54,760,10.16,68.87,0.78,4920286,
|
27
|
+
Nicaragua,2001,0.8,349.6,559.75,,760,15.03,,0.71,4991475,
|
28
|
+
Nicaragua,2002,0.78,373.83,552.24,3.3,760,17.82,69.48,0.87,5050368,
|
29
|
+
Nicaragua,2003,0.77,373.59,613.75,,790,19.62,,0.87,5096507,0.21
|
30
|
+
Nicaragua,2004,,416.57,643.39,,870,24.4,,0.7,5122841,
|
31
|
+
Nicaragua,2005,,,,3.08,950,27.19,70.39,0.69,5149311,0.24
|
32
|
+
"Korea, Dem. Rep.",2000,3.52,747.14,903.3,2.02E+00,,,63.1,,2.19E+07,
|
33
|
+
"Korea, Dem. Rep.",2001,3.63,772.66,928.63,,,0,,,2.20E+07,
|
34
|
+
"Korea, Dem. Rep.",2002,3.43,751.85,882.16,2.00E+00,,,63.04,,2.21E+07,
|
35
|
+
"Korea, Dem. Rep.",2003,3.48,793.87,894.09,,,,,,2.23E+07,0.2
|
36
|
+
"Korea, Dem. Rep.",2004,,826.54,910.17,,,,,,2.24E+07,
|
37
|
+
"Korea, Dem. Rep.",2005,,,,1.96E+00,,,63.92,,2.25E+07,0.2
|
38
|
+
Kyrgyz Republic,2000,0.94,1687.38,497.43,2.4,280,10.5,68.56,2.85,4915300,
|
39
|
+
Kyrgyz Republic,2001,0.78,1443.05,451.08,2.4,280,30.39,68.71,2.35,4954800,
|
40
|
+
Kyrgyz Republic,2002,0.99,1365.26,507.89,2.5,290,30.44,68.11,2.73,4993200,
|
41
|
+
Kyrgyz Republic,2003,1.06,1646.69,531.5,2.5,340,39.69,68.26,2.87,5038600,0.1
|
42
|
+
Kyrgyz Republic,2004,,1421.22,545.87,2.45,400,51.64,68.15,2.85,5092802,
|
43
|
+
Kyrgyz Republic,2005,,,,2.41,450,54.44,68.34,,5143500,0.14
|
44
|
+
Middle East & North Africa,2000,3.21,1075.27,1076.66,3.27,1.66E+03,9.02,68.14,4.32,2.79E+08,
|
45
|
+
Middle East & North Africa,2001,3.21,1122.92,1104.58,,1.71E+03,11.96,,4.59,2.84E+08,
|
46
|
+
Middle East & North Africa,2002,3.5,1176.9,1145.34,3.11,1.70E+03,28.94,68.79,4.18,2.90E+08,
|
47
|
+
Middle East & North Africa,2003,3.43,1221.77,1157.82,,1.81E+03,41.89,,4.29,2.95E+08,0.12
|
48
|
+
Middle East & North Africa,2004,,1290.24,1190.15,,1.98E+03,60.28,,4.28,3.00E+08,
|
49
|
+
Middle East & North Africa,2005,,,,2.97,2.22E+03,88.9,69.67,3.73,3.06E+08,0.15
|
data/lib/rodimus/step.rb
CHANGED
@@ -32,9 +32,12 @@ module Rodimus
|
|
32
32
|
|
33
33
|
def run
|
34
34
|
Rodimus.logger.info "Running #{self}"
|
35
|
+
@row_count = 1
|
35
36
|
incoming.each do |row|
|
36
37
|
transformed_row = process_row(row)
|
37
38
|
handle_output(transformed_row)
|
39
|
+
Rodimus.logger.info(self) { "#{@row_count} rows processed" } if @row_count % 50000 == 0
|
40
|
+
@row_count += 1
|
38
41
|
end
|
39
42
|
finalize
|
40
43
|
Rodimus.logger.info "Finished #{self}"
|
@@ -43,7 +46,7 @@ module Rodimus
|
|
43
46
|
end
|
44
47
|
|
45
48
|
def to_s
|
46
|
-
"#{self.class} connected to input: #{incoming} and output: #{outgoing}"
|
49
|
+
"#{self.class} connected to input: #{incoming || 'nil'} and output: #{outgoing || 'nil'}"
|
47
50
|
end
|
48
51
|
end
|
49
52
|
|
@@ -3,22 +3,24 @@ require 'drb'
|
|
3
3
|
module Rodimus
|
4
4
|
|
5
5
|
class Transformation
|
6
|
-
attr_reader :drb_server, :steps
|
6
|
+
attr_reader :drb_server, :pids, :steps
|
7
7
|
|
8
8
|
# User-data accessible across all running steps.
|
9
9
|
attr_reader :shared_data
|
10
10
|
|
11
11
|
def initialize
|
12
12
|
@steps = []
|
13
|
+
@pids = []
|
13
14
|
@shared_data = {} # TODO: This needs to be thread safe
|
14
15
|
end
|
15
16
|
|
16
17
|
def run
|
17
18
|
@drb_server = DRb.start_service(nil, shared_data)
|
19
|
+
pids.clear
|
18
20
|
prepare
|
19
21
|
|
20
22
|
steps.each do |step|
|
21
|
-
fork do
|
23
|
+
pids << fork do
|
22
24
|
DRb.start_service # the parent DRb thread dies across the fork
|
23
25
|
step.shared_data = DRbObject.new_with_uri(drb_server.uri)
|
24
26
|
step.run
|
data/lib/rodimus/version.rb
CHANGED
data/rodimus.gemspec
CHANGED
@@ -4,14 +4,15 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
4
4
|
require 'rodimus/version'
|
5
5
|
|
6
6
|
Gem::Specification.new do |spec|
|
7
|
-
spec.name
|
8
|
-
spec.version
|
9
|
-
spec.authors
|
10
|
-
spec.email
|
11
|
-
spec.summary
|
12
|
-
spec.description
|
13
|
-
spec.homepage
|
14
|
-
spec.license
|
7
|
+
spec.name = "rodimus"
|
8
|
+
spec.version = Rodimus::VERSION
|
9
|
+
spec.authors = ["Brandon Rice"]
|
10
|
+
spec.email = ["brice84@gmail.com"]
|
11
|
+
spec.summary = "An ETL (Extract-Transform-Load) library that uses a forking process model for concurrency."
|
12
|
+
spec.description = "An ETL (Extract-Transform-Load) library that uses a forking process model for concurrency."
|
13
|
+
spec.homepage = "https://github.com/nevern02/rodimus"
|
14
|
+
spec.license = "MIT"
|
15
|
+
spec.required_ruby_version = ">= 1.9.2"
|
15
16
|
|
16
17
|
spec.files = `git ls-files -z`.split("\x0")
|
17
18
|
spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
|
@@ -20,5 +21,4 @@ Gem::Specification.new do |spec|
|
|
20
21
|
|
21
22
|
spec.add_development_dependency "bundler", "~> 1.5"
|
22
23
|
spec.add_development_dependency "rake"
|
23
|
-
spec.add_development_dependency "mongo"
|
24
24
|
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
require 'minitest/autorun'
|
2
|
+
require 'rodimus'
|
3
|
+
|
4
|
+
module Rodimus
|
5
|
+
|
6
|
+
class TestTransformation < MiniTest::Unit::TestCase
|
7
|
+
Rodimus.configure do |config|
|
8
|
+
config.logger = Logger.new(nil)
|
9
|
+
end
|
10
|
+
|
11
|
+
def test_forking_processes
|
12
|
+
incoming = StringIO.new
|
13
|
+
transformation = Transformation.new
|
14
|
+
steps = []
|
15
|
+
number_of_steps = 2 + rand(5)
|
16
|
+
number_of_steps.times { steps << Object.new }
|
17
|
+
steps.each do |step|
|
18
|
+
step.extend(Rodimus::Step)
|
19
|
+
transformation.steps << step
|
20
|
+
end
|
21
|
+
steps.first.incoming = incoming
|
22
|
+
transformation.run
|
23
|
+
assert_equal(steps.count, transformation.pids.count)
|
24
|
+
end
|
25
|
+
end
|
26
|
+
|
27
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rodimus
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.1
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Brandon Rice
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-05-
|
11
|
+
date: 2014-05-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -38,20 +38,6 @@ dependencies:
|
|
38
38
|
- - ">="
|
39
39
|
- !ruby/object:Gem::Version
|
40
40
|
version: '0'
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: mongo
|
43
|
-
requirement: !ruby/object:Gem::Requirement
|
44
|
-
requirements:
|
45
|
-
- - ">="
|
46
|
-
- !ruby/object:Gem::Version
|
47
|
-
version: '0'
|
48
|
-
type: :development
|
49
|
-
prerelease: false
|
50
|
-
version_requirements: !ruby/object:Gem::Requirement
|
51
|
-
requirements:
|
52
|
-
- - ">="
|
53
|
-
- !ruby/object:Gem::Version
|
54
|
-
version: '0'
|
55
41
|
description: An ETL (Extract-Transform-Load) library that uses a forking process model
|
56
42
|
for concurrency.
|
57
43
|
email:
|
@@ -68,7 +54,8 @@ files:
|
|
68
54
|
- LICENSE.txt
|
69
55
|
- README.md
|
70
56
|
- Rakefile
|
71
|
-
- examples/mongo_input.rb
|
57
|
+
- examples/csv_input_stdout.rb
|
58
|
+
- examples/worldbank-sample.csv
|
72
59
|
- lib/rodimus.rb
|
73
60
|
- lib/rodimus/configuration.rb
|
74
61
|
- lib/rodimus/step.rb
|
@@ -76,6 +63,7 @@ files:
|
|
76
63
|
- lib/rodimus/version.rb
|
77
64
|
- rodimus.gemspec
|
78
65
|
- test/step_test.rb
|
66
|
+
- test/transformation_test.rb
|
79
67
|
homepage: https://github.com/nevern02/rodimus
|
80
68
|
licenses:
|
81
69
|
- MIT
|
@@ -88,7 +76,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
88
76
|
requirements:
|
89
77
|
- - ">="
|
90
78
|
- !ruby/object:Gem::Version
|
91
|
-
version: '0'
|
79
|
+
version: 1.9.2
|
92
80
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
93
81
|
requirements:
|
94
82
|
- - ">="
|
@@ -103,3 +91,4 @@ summary: An ETL (Extract-Transform-Load) library that uses a forking process mod
|
|
103
91
|
for concurrency.
|
104
92
|
test_files:
|
105
93
|
- test/step_test.rb
|
94
|
+
- test/transformation_test.rb
|
data/examples/mongo_input.rb
DELETED
@@ -1,40 +0,0 @@
|
|
1
|
-
require 'rodimus'
|
2
|
-
require 'mongo'
|
3
|
-
require 'json'
|
4
|
-
|
5
|
-
class MongoInput
|
6
|
-
attr_reader :client, :db, :collection
|
7
|
-
|
8
|
-
include Rodimus::Step
|
9
|
-
|
10
|
-
def initialize
|
11
|
-
@client = Mongo::MongoClient.new('localhost', 27017)
|
12
|
-
@db = client['inventory_events']
|
13
|
-
@collection = db['model_events']
|
14
|
-
@incoming = collection.find.limit(4)
|
15
|
-
end
|
16
|
-
|
17
|
-
def process_row(row)
|
18
|
-
row.to_json
|
19
|
-
end
|
20
|
-
end
|
21
|
-
|
22
|
-
class TempfileOut
|
23
|
-
include Rodimus::Step
|
24
|
-
|
25
|
-
def initialize
|
26
|
-
@outgoing = File.new('output.txt', 'w')
|
27
|
-
end
|
28
|
-
|
29
|
-
def process_row(row)
|
30
|
-
JSON.parse(row).keys.join(',')
|
31
|
-
end
|
32
|
-
end
|
33
|
-
|
34
|
-
t = Rodimus::Transformation.new
|
35
|
-
s1 = MongoInput.new
|
36
|
-
s2 = TempfileOut.new
|
37
|
-
t.steps << s1
|
38
|
-
t.steps << s2
|
39
|
-
t.run
|
40
|
-
puts "Transformation to #{s2.outgoing.path} complete!"
|