itiel 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.autotest +17 -0
- data/.gitignore +13 -0
- data/.gitlab-ci.yml +36 -0
- data/.rspec +2 -0
- data/.ruby-version +1 -0
- data/.travis.yml +9 -0
- data/Gemfile +4 -0
- data/Gemfile.rails.4.0 +7 -0
- data/Gemfile.rails.4.1 +7 -0
- data/Gemfile.rails.4.2 +7 -0
- data/README.markdown +106 -0
- data/Rakefile +13 -0
- data/build.sh +10 -0
- data/features/extract/database_table.feature +16 -0
- data/features/extract/sql_script.feature +17 -0
- data/features/load/database_table_loader.feature +21 -0
- data/features/lookup/csv_file.feature +41 -0
- data/features/lookup/database_table.feature +43 -0
- data/features/script/ruby_script.feature +19 -0
- data/features/step_definitions/csv_steps.rb +15 -0
- data/features/step_definitions/extractor/csv_file_steps.rb +3 -0
- data/features/step_definitions/extractor/custom_sql_steps.rb +6 -0
- data/features/step_definitions/extractor/database_steps.rb +27 -0
- data/features/step_definitions/extractor/database_table_steps.rb +8 -0
- data/features/step_definitions/extractor/extraction_steps.rb +3 -0
- data/features/step_definitions/flow_steps.rb +9 -0
- data/features/step_definitions/loader/csv_file_steps.rb +4 -0
- data/features/step_definitions/loader/database_table_steps.rb +14 -0
- data/features/step_definitions/lookup/lookup_steps.rb +35 -0
- data/features/step_definitions/scripting/ruby_script_steps.rb +5 -0
- data/features/step_definitions/stream_steps.rb +8 -0
- data/features/step_definitions/transformation/calculated_column_steps.rb +5 -0
- data/features/step_definitions/transformation/calculated_columns_steps.rb +7 -0
- data/features/step_definitions/transformation/constant_column_steps.rb +3 -0
- data/features/step_definitions/transformation/map_values_step.rb +4 -0
- data/features/step_definitions/transformation/rename_column_steps.rb +3 -0
- data/features/step_definitions/transformation/select_column_steps.rb +3 -0
- data/features/step_definitions/transformation/single_column_sort_steps.rb +3 -0
- data/features/support/database.yml +1 -0
- data/features/support/env.rb +13 -0
- data/features/transform/transformations.feature +123 -0
- data/itiel.gemspec +34 -0
- data/lib/itiel.rb +45 -0
- data/lib/itiel/db/connection.rb +24 -0
- data/lib/itiel/db/sql_connectable.rb +33 -0
- data/lib/itiel/db/truncator.rb +30 -0
- data/lib/itiel/extract/chained_step.rb +22 -0
- data/lib/itiel/extract/csv_file.rb +31 -0
- data/lib/itiel/extract/custom_sql.rb +38 -0
- data/lib/itiel/extract/database_table.rb +23 -0
- data/lib/itiel/job.rb +116 -0
- data/lib/itiel/load/chained_step.rb +37 -0
- data/lib/itiel/load/csv_file.rb +45 -0
- data/lib/itiel/load/database_table.rb +34 -0
- data/lib/itiel/load/input_output_behavior.rb +36 -0
- data/lib/itiel/logger.rb +47 -0
- data/lib/itiel/lookup/chained_step.rb +35 -0
- data/lib/itiel/lookup/csv_file.rb +16 -0
- data/lib/itiel/lookup/database_table.rb +36 -0
- data/lib/itiel/lookup/hash_lookup.rb +35 -0
- data/lib/itiel/nameable.rb +6 -0
- data/lib/itiel/script/chained_step.rb +18 -0
- data/lib/itiel/script/ruby_script.rb +31 -0
- data/lib/itiel/script/sql_script.rb +29 -0
- data/lib/itiel/transform/calculated_columns.rb +47 -0
- data/lib/itiel/transform/chained_step.rb +27 -0
- data/lib/itiel/transform/constant_column.rb +35 -0
- data/lib/itiel/transform/input_output_behavior.rb +44 -0
- data/lib/itiel/transform/map_values.rb +43 -0
- data/lib/itiel/transform/remove_column.rb +33 -0
- data/lib/itiel/transform/rename_column.rb +43 -0
- data/lib/itiel/transform/select_column.rb +37 -0
- data/lib/itiel/version.rb +3 -0
- data/spec/db/sql_connectable_spec.rb +20 -0
- data/spec/extract/chained_step_spec.rb +31 -0
- data/spec/extract/csv_file_spec.rb +22 -0
- data/spec/extract/custom_sql_spec.rb +19 -0
- data/spec/extract/database_table_spec.rb +22 -0
- data/spec/job_spec.rb +80 -0
- data/spec/loader/chained_step_spec.rb +39 -0
- data/spec/loader/csv_file_spec.rb +69 -0
- data/spec/loader/database_table_spec.rb +29 -0
- data/spec/lookup/hash_lookup_spec.rb +108 -0
- data/spec/nameable_spec.rb +17 -0
- data/spec/script/chained_step_spec.rb +24 -0
- data/spec/script/ruby_script_spec.rb +18 -0
- data/spec/script/sql_script_spec.rb +41 -0
- data/spec/spec_helper.rb +24 -0
- data/spec/support/config/database.yml +1 -0
- data/spec/support/config/sources.yml +9 -0
- data/spec/transform/calculated_columns_spec.rb +36 -0
- data/spec/transform/chained_step_spec.rb +36 -0
- data/spec/transform/constant_column_spec.rb +22 -0
- data/spec/transform/map_values_spec.rb +26 -0
- data/spec/transform/rename_column_spec.rb +25 -0
- data/spec/transform/select_column_spec.rb +21 -0
- metadata +344 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 319124d81efc2cd9b0963c9f1693862d7347170e
|
4
|
+
data.tar.gz: 9e89b3071f090742e273d80a0023f703064f7325
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: e4b71fb0ccdcec9e48cae895382e069b67680924b56901e034a10e6e452448081b2ea0fa5538e63e334ed2c15760dfa0143799795cbd753488fc72ba5b0d638f
|
7
|
+
data.tar.gz: cc404b8fa1e0404e93caa8ab29d031a1496f7d74fe4440d55a524ff623d6cc6afc4ca62e87c19806ae18955768e14d1facb30728f0a41a4c1453f75fc0419620
|
data/.autotest
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'autotest/fsevent'
|
2
|
+
require 'autotest/growl'
|
3
|
+
require 'autotest/fsevent'
|
4
|
+
require 'autotest/timestamp'
|
5
|
+
|
6
|
+
Autotest.add_hook :initialize do |at|
|
7
|
+
at.clear_mappings
|
8
|
+
|
9
|
+
%w{.git coverage/ README gemspec DS_Store features/ Gemfile.lock Rakefile Gemfile}.each do |exception|
|
10
|
+
at.add_exception exception
|
11
|
+
end
|
12
|
+
|
13
|
+
at.add_mapping %r%^lib/itiel/(.*)\.rb$% do |_, m|
|
14
|
+
["test/unit/#{m[1]}_test.rb"]
|
15
|
+
end
|
16
|
+
|
17
|
+
end
|
data/.gitignore
ADDED
data/.gitlab-ci.yml
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
cache:
|
2
|
+
paths:
|
3
|
+
- vendor/ruby
|
4
|
+
|
5
|
+
before_script:
|
6
|
+
- ruby -v # Print out ruby version for debugging
|
7
|
+
- gem install bundler --no-ri --no-rdoc # Bundler is not installed with the image
|
8
|
+
- bundle install -j $(nproc) --path vendor # Install dependencies into ./vendor/ruby
|
9
|
+
|
10
|
+
tests:2.3.rails.4.0:
|
11
|
+
image: "ruby:2.3"
|
12
|
+
variables:
|
13
|
+
BUNDLE_GEMFILE: "Gemfile.rails.4.0"
|
14
|
+
script:
|
15
|
+
- bundle exec rake
|
16
|
+
|
17
|
+
tests:2.3.rails.4.1:
|
18
|
+
image: "ruby:2.3"
|
19
|
+
variables:
|
20
|
+
BUNDLE_GEMFILE: "Gemfile.rails.4.1"
|
21
|
+
script:
|
22
|
+
- bundle exec rake
|
23
|
+
|
24
|
+
tests:2.2.rails.4.0:
|
25
|
+
image: "ruby:2.2"
|
26
|
+
variables:
|
27
|
+
BUNDLE_GEMFILE: "Gemfile.rails.4.0"
|
28
|
+
script:
|
29
|
+
- bundle exec rake
|
30
|
+
|
31
|
+
tests:2.2.rails.4.1:
|
32
|
+
image: "ruby:2.2"
|
33
|
+
variables:
|
34
|
+
BUNDLE_GEMFILE: "Gemfile.rails.4.1"
|
35
|
+
script:
|
36
|
+
- bundle exec rake
|
data/.rspec
ADDED
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
2.3.2
|
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/Gemfile.rails.4.0
ADDED
data/Gemfile.rails.4.1
ADDED
data/Gemfile.rails.4.2
ADDED
data/README.markdown
ADDED
@@ -0,0 +1,106 @@
|
|
1
|
+
# Itiel
|
2
|
+
|
3
|
+
[![build status](https://gitlab.com/dabit/itiel/badges/master/build.svg)](https://gitlab.com/dabit/itiel/commits/master)
|
4
|
+
[![coverage report](https://gitlab.com/dabit/itiel/badges/master/coverage.svg)](https://gitlab.com/dabit/itiel/commits/master)
|
5
|
+
|
6
|
+
Hopefully, it will be an awesome Framework to do ETL with Ruby. It
|
7
|
+
should only work with *Ruby 2.0+*.
|
8
|
+
|
9
|
+
[A working example](https://github.com/railsmx/rails-mx-blog/blob/master/itiel/posts.rb)
|
10
|
+
|
11
|
+
# This README is work in progress
|
12
|
+
|
13
|
+
You should not trust it, until I clean it up.
|
14
|
+
|
15
|
+
For better examples and documentation, refer to the *features* folder.
|
16
|
+
|
17
|
+
|
18
|
+
# TODO: Everything I want it to be, below:
|
19
|
+
|
20
|
+
### Define the Itiel project structure
|
21
|
+
|
22
|
+
I have in mind something like this:
|
23
|
+
|
24
|
+
/itiel_example/
|
25
|
+
|~config/
|
26
|
+
| `-sources.yml
|
27
|
+
|~lib/
|
28
|
+
| `-transformation_class1.rb
|
29
|
+
|~jobs/
|
30
|
+
| `-main_job.rb
|
31
|
+
|~tasks/
|
32
|
+
| `-itiel.rake
|
33
|
+
|-Gemfile
|
34
|
+
|-Gemfile.lock
|
35
|
+
|
36
|
+
### Controllers
|
37
|
+
|
38
|
+
Take care of defining and running the jobs. It's where you put all the
|
39
|
+
code like the example above.
|
40
|
+
|
41
|
+
### A command line bin to generate the basic structure
|
42
|
+
|
43
|
+
itiel new project_name.
|
44
|
+
|
45
|
+
Use thor?
|
46
|
+
|
47
|
+
### Rake
|
48
|
+
|
49
|
+
rake itiel:run
|
50
|
+
|
51
|
+
### Mongo
|
52
|
+
|
53
|
+
Picture yourself migrating from SQL to Mongo
|
54
|
+
|
55
|
+
### Joins, Scripts and lots more transformations
|
56
|
+
|
57
|
+
Some on my mind:
|
58
|
+
|
59
|
+
Itiel::Joins::DatabaseTable
|
60
|
+
Itiel::Lookups::DatabaseTable
|
61
|
+
Itiel::Loads::FTP
|
62
|
+
Itiel::Loads::HTTPRequest
|
63
|
+
Itiel::Loads::MongoDBDocument
|
64
|
+
Itiel::Script::ExecuteInSystem
|
65
|
+
Itiel::Extracts::HTTP
|
66
|
+
Itiel::Extracts::MongoDBDocument
|
67
|
+
Itiel::Transforms::CustomSort
|
68
|
+
Itiel::Transforms::MapValues
|
69
|
+
Itiel::Transforms::IfNil
|
70
|
+
|
71
|
+
Just ideas, I have nothing on how to implement most of them
|
72
|
+
|
73
|
+
### Filters and conditional stream routing
|
74
|
+
|
75
|
+
If the column X has a value of Y, send true output to this step, send
|
76
|
+
false output to this other step
|
77
|
+
|
78
|
+
### Slowly Changing Dimensions
|
79
|
+
|
80
|
+
Can't have ETL without these
|
81
|
+
|
82
|
+
### Test helpers
|
83
|
+
|
84
|
+
So you can TDD your ETL
|
85
|
+
|
86
|
+
### Logging
|
87
|
+
|
88
|
+
It would be extra nice if all the steps logged statistics
|
89
|
+
|
90
|
+
* Processing X rows of Y
|
91
|
+
* Time spent
|
92
|
+
* Transformation Name
|
93
|
+
* Configurable log level
|
94
|
+
|
95
|
+
### Viz
|
96
|
+
|
97
|
+
rake itiel:graph
|
98
|
+
|
99
|
+
Creates a graphical representation of your ETL flow
|
100
|
+
|
101
|
+
### Ruby 1.8 and 1.9
|
102
|
+
|
103
|
+
Not really, Ruby 2.0+ is enough. I don't have plans to make it
|
104
|
+
work on 1.8 or 1.9
|
105
|
+
|
106
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'bundler'
|
2
|
+
Bundler::GemHelper.install_tasks
|
3
|
+
|
4
|
+
require 'rspec/core/rake_task'
|
5
|
+
require 'cucumber/rake/task'
|
6
|
+
|
7
|
+
task :default => [:spec, :cucumber]
|
8
|
+
|
9
|
+
RSpec::Core::RakeTask.new
|
10
|
+
|
11
|
+
Cucumber::Rake::Task.new do |task|
|
12
|
+
task.cucumber_opts = %w{--format progress}
|
13
|
+
end
|
data/build.sh
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
Feature: Extract data from a Database Table
|
2
|
+
|
3
|
+
@database
|
4
|
+
Scenario:
|
5
|
+
Given a blogposts database table with rows:
|
6
|
+
| id | title | author |
|
7
|
+
| 1 | The great tool | Tim Taylor |
|
8
|
+
| 2 | A football anecdote | John Miller |
|
9
|
+
| 3 | Historical music | John Mellencamp |
|
10
|
+
|
11
|
+
# @database_table = Itiel::Extract::DatabaseTable.new
|
12
|
+
# @database_table.connection = :test
|
13
|
+
# @database_table.table_name = 'blogposts'
|
14
|
+
When I create a Itiel::Extract::DatabaseTable object for the 'blogposts' table
|
15
|
+
Then the extraction for that object should have 3 rows
|
16
|
+
|
@@ -0,0 +1,17 @@
|
|
1
|
+
Feature: Extract rows from a database using a custom SQL script
|
2
|
+
|
3
|
+
@database
|
4
|
+
Scenario:
|
5
|
+
Given a blogposts database table with rows:
|
6
|
+
| id | title | author |
|
7
|
+
| 1 | The great tool | Tim Taylor |
|
8
|
+
| 2 | A football anecdote | John Miller |
|
9
|
+
| 3 | Historical music | John Mellencamp |
|
10
|
+
|
11
|
+
When I create a Itiel::Extract::CustomSQL object with the query:
|
12
|
+
"""
|
13
|
+
SELECT * FROM blogposts WHERE id = 3;
|
14
|
+
"""
|
15
|
+
|
16
|
+
Then the extraction for that object should have 1 row
|
17
|
+
|
@@ -0,0 +1,21 @@
|
|
1
|
+
Feature: Load data into a database table
|
2
|
+
|
3
|
+
Background:
|
4
|
+
Given a blogposts database table
|
5
|
+
And a "source.csv" file with the following rows:
|
6
|
+
| id | title | author |
|
7
|
+
| 1 | Pride and Prejudice | Jane Austen |
|
8
|
+
| 2 | Treasure Island | R L Stevenson |
|
9
|
+
| 3 | Steve Jobs | Walter Isaacson |
|
10
|
+
| 4 | The thank you economy | Gary Vaynerchuk |
|
11
|
+
|
12
|
+
# @source = Itiel::Extract::CSVFile.new('source.csv')
|
13
|
+
And I create a Extractor::CSVFile object with "source.csv"
|
14
|
+
# @destination = Itiel::Load::DatabaseTable.new :test, "blogposts"
|
15
|
+
And I create a Loader::DatabaseTable object for the "blogposts" table
|
16
|
+
|
17
|
+
@database
|
18
|
+
Scenario:
|
19
|
+
When I load the source into the destination
|
20
|
+
Then the "blogposts" table should have 4 records
|
21
|
+
|
@@ -0,0 +1,41 @@
|
|
1
|
+
Feature: Lookup data from a CSV file
|
2
|
+
|
3
|
+
@database
|
4
|
+
Scenario:
|
5
|
+
Given an orders database table with rows:
|
6
|
+
| id | email |
|
7
|
+
| 1 | john@example.com |
|
8
|
+
| 2 | tim@test.com |
|
9
|
+
| 3 | math@example.com |
|
10
|
+
|
11
|
+
And a "customers.csv" file with the following rows:
|
12
|
+
| id | email |
|
13
|
+
| 1 | john@example.com |
|
14
|
+
| 2 | tim@test.com |
|
15
|
+
| 3 | math@example.com |
|
16
|
+
|
17
|
+
# @database_table = Itiel::Extract::DatabaseTable.new
|
18
|
+
# @database_table.connection = :test
|
19
|
+
# @database_table.table_name = 'orders'
|
20
|
+
And I create a Itiel::Extract::DatabaseTable object for the 'orders' table
|
21
|
+
|
22
|
+
# @lookup = Itiel::Lookup::CSVFile.new('customers.csv')
|
23
|
+
And I create a Itiel::Lookup::CSVFile object with the "customers.csv" file
|
24
|
+
|
25
|
+
# @lookup.lookup_columns = { :email => :email }
|
26
|
+
And the lookup joins the "email" column in the source with the "email" column in the lookup stream
|
27
|
+
|
28
|
+
# @lookup.joined_columns = { :id => :customer_id }
|
29
|
+
And the lookup appends the "id" column on the lookup stream to the source as "customer_id"
|
30
|
+
|
31
|
+
And the data flows in the following direction:
|
32
|
+
| @database_table |
|
33
|
+
| @lookup |
|
34
|
+
|
35
|
+
When I start the source
|
36
|
+
|
37
|
+
Then the output for the lookup should be:
|
38
|
+
| id | email | customer_id |
|
39
|
+
| 1 | john@example.com | 1 |
|
40
|
+
| 2 | tim@test.com | 2 |
|
41
|
+
| 3 | math@example.com | 3 |
|
@@ -0,0 +1,43 @@
|
|
1
|
+
Feature: Lookup data from a Database Table
|
2
|
+
|
3
|
+
@database
|
4
|
+
Scenario:
|
5
|
+
Given a blogposts database table with rows:
|
6
|
+
| id | title | author |
|
7
|
+
| 1 | The great tool | Tim Taylor |
|
8
|
+
| 2 | A football anecdote | John Miller |
|
9
|
+
| 3 | Historical music | John Mellencamp |
|
10
|
+
|
11
|
+
And an authors database table with rows:
|
12
|
+
| id | name |
|
13
|
+
| 1 | Tim Taylor |
|
14
|
+
| 2 | John Miller |
|
15
|
+
| 3 | John Mellencamp |
|
16
|
+
|
17
|
+
# @database_table = Itiel::Extract::DatabaseTable.new
|
18
|
+
# @database_table.connection = :test
|
19
|
+
# @database_table.table_name = 'blogposts'
|
20
|
+
And I create a Itiel::Extract::DatabaseTable object for the 'blogposts' table
|
21
|
+
|
22
|
+
# @lookup = Itiel::Lookup::DatabaseTable.new
|
23
|
+
# @lookup.connection = :test
|
24
|
+
# @lookup.table_name = 'authors'
|
25
|
+
And I create a Itiel::Lookup::DatabaseTable object for the "authors"
|
26
|
+
|
27
|
+
# @lookup.lookup_columns = { :author => :name }
|
28
|
+
And the lookup joins the "author" column in the source with the "name" column in the lookup stream
|
29
|
+
|
30
|
+
# @lookup.joined_columns = { :id => :author_id }
|
31
|
+
And the lookup appends the "id" column on the lookup stream to the source as "author_id"
|
32
|
+
|
33
|
+
And the data flows in the following direction:
|
34
|
+
| @database_table |
|
35
|
+
| @lookup |
|
36
|
+
|
37
|
+
When I start the source
|
38
|
+
|
39
|
+
Then the output for the lookup should be:
|
40
|
+
| id | title | author | author_id |
|
41
|
+
| 1 | The great tool | Tim Taylor | 1 |
|
42
|
+
| 2 | A football anecdote | John Miller | 2 |
|
43
|
+
| 3 | Historical music | John Mellencamp | 3 |
|
@@ -0,0 +1,19 @@
|
|
1
|
+
Feature: Ruby scripting
|
2
|
+
|
3
|
+
Scenario:
|
4
|
+
Given the following data is in the stream:
|
5
|
+
| id | title | author |
|
6
|
+
| 1 | The great tool | Tim Taylor |
|
7
|
+
| 2 | A football anecdote | John Miller |
|
8
|
+
| 3 | Historical music | John Mellencamp |
|
9
|
+
|
10
|
+
And I create a Itiel::Script::RubyScript object with the following block of code:
|
11
|
+
"""
|
12
|
+
row["slug"] = row["title"].parameterize
|
13
|
+
"""
|
14
|
+
|
15
|
+
Then the resulting output stream should be:
|
16
|
+
| id | title | author | slug |
|
17
|
+
| 1 | The great tool | Tim Taylor | the-great-tool |
|
18
|
+
| 2 | A football anecdote | John Miller | a-football-anecdote |
|
19
|
+
| 3 | Historical music | John Mellencamp | historical-music |
|
@@ -0,0 +1,15 @@
|
|
1
|
+
Given /^a "([^"]*)" file with the following rows:$/ do |filename, table|
|
2
|
+
CSV.open File.join('tmp', filename), "wb" do |csv|
|
3
|
+
table.raw.each { |row| csv << row }
|
4
|
+
end
|
5
|
+
end
|
6
|
+
|
7
|
+
Then /^the "([^"]*)" file should exist with the following content:$/ do |filename, table|
|
8
|
+
expected_file = CSV.generate do |csv|
|
9
|
+
table.raw.each { |row| csv << row }
|
10
|
+
end
|
11
|
+
|
12
|
+
resulting_file = File.new(File.join('tmp', filename), "rb").read
|
13
|
+
|
14
|
+
expect(resulting_file).to eq expected_file
|
15
|
+
end
|