itiel 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.autotest +17 -0
- data/.gitignore +13 -0
- data/.gitlab-ci.yml +36 -0
- data/.rspec +2 -0
- data/.ruby-version +1 -0
- data/.travis.yml +9 -0
- data/Gemfile +4 -0
- data/Gemfile.rails.4.0 +7 -0
- data/Gemfile.rails.4.1 +7 -0
- data/Gemfile.rails.4.2 +7 -0
- data/README.markdown +106 -0
- data/Rakefile +13 -0
- data/build.sh +10 -0
- data/features/extract/database_table.feature +16 -0
- data/features/extract/sql_script.feature +17 -0
- data/features/load/database_table_loader.feature +21 -0
- data/features/lookup/csv_file.feature +41 -0
- data/features/lookup/database_table.feature +43 -0
- data/features/script/ruby_script.feature +19 -0
- data/features/step_definitions/csv_steps.rb +15 -0
- data/features/step_definitions/extractor/csv_file_steps.rb +3 -0
- data/features/step_definitions/extractor/custom_sql_steps.rb +6 -0
- data/features/step_definitions/extractor/database_steps.rb +27 -0
- data/features/step_definitions/extractor/database_table_steps.rb +8 -0
- data/features/step_definitions/extractor/extraction_steps.rb +3 -0
- data/features/step_definitions/flow_steps.rb +9 -0
- data/features/step_definitions/loader/csv_file_steps.rb +4 -0
- data/features/step_definitions/loader/database_table_steps.rb +14 -0
- data/features/step_definitions/lookup/lookup_steps.rb +35 -0
- data/features/step_definitions/scripting/ruby_script_steps.rb +5 -0
- data/features/step_definitions/stream_steps.rb +8 -0
- data/features/step_definitions/transformation/calculated_column_steps.rb +5 -0
- data/features/step_definitions/transformation/calculated_columns_steps.rb +7 -0
- data/features/step_definitions/transformation/constant_column_steps.rb +3 -0
- data/features/step_definitions/transformation/map_values_step.rb +4 -0
- data/features/step_definitions/transformation/rename_column_steps.rb +3 -0
- data/features/step_definitions/transformation/select_column_steps.rb +3 -0
- data/features/step_definitions/transformation/single_column_sort_steps.rb +3 -0
- data/features/support/database.yml +1 -0
- data/features/support/env.rb +13 -0
- data/features/transform/transformations.feature +123 -0
- data/itiel.gemspec +34 -0
- data/lib/itiel.rb +45 -0
- data/lib/itiel/db/connection.rb +24 -0
- data/lib/itiel/db/sql_connectable.rb +33 -0
- data/lib/itiel/db/truncator.rb +30 -0
- data/lib/itiel/extract/chained_step.rb +22 -0
- data/lib/itiel/extract/csv_file.rb +31 -0
- data/lib/itiel/extract/custom_sql.rb +38 -0
- data/lib/itiel/extract/database_table.rb +23 -0
- data/lib/itiel/job.rb +116 -0
- data/lib/itiel/load/chained_step.rb +37 -0
- data/lib/itiel/load/csv_file.rb +45 -0
- data/lib/itiel/load/database_table.rb +34 -0
- data/lib/itiel/load/input_output_behavior.rb +36 -0
- data/lib/itiel/logger.rb +47 -0
- data/lib/itiel/lookup/chained_step.rb +35 -0
- data/lib/itiel/lookup/csv_file.rb +16 -0
- data/lib/itiel/lookup/database_table.rb +36 -0
- data/lib/itiel/lookup/hash_lookup.rb +35 -0
- data/lib/itiel/nameable.rb +6 -0
- data/lib/itiel/script/chained_step.rb +18 -0
- data/lib/itiel/script/ruby_script.rb +31 -0
- data/lib/itiel/script/sql_script.rb +29 -0
- data/lib/itiel/transform/calculated_columns.rb +47 -0
- data/lib/itiel/transform/chained_step.rb +27 -0
- data/lib/itiel/transform/constant_column.rb +35 -0
- data/lib/itiel/transform/input_output_behavior.rb +44 -0
- data/lib/itiel/transform/map_values.rb +43 -0
- data/lib/itiel/transform/remove_column.rb +33 -0
- data/lib/itiel/transform/rename_column.rb +43 -0
- data/lib/itiel/transform/select_column.rb +37 -0
- data/lib/itiel/version.rb +3 -0
- data/spec/db/sql_connectable_spec.rb +20 -0
- data/spec/extract/chained_step_spec.rb +31 -0
- data/spec/extract/csv_file_spec.rb +22 -0
- data/spec/extract/custom_sql_spec.rb +19 -0
- data/spec/extract/database_table_spec.rb +22 -0
- data/spec/job_spec.rb +80 -0
- data/spec/loader/chained_step_spec.rb +39 -0
- data/spec/loader/csv_file_spec.rb +69 -0
- data/spec/loader/database_table_spec.rb +29 -0
- data/spec/lookup/hash_lookup_spec.rb +108 -0
- data/spec/nameable_spec.rb +17 -0
- data/spec/script/chained_step_spec.rb +24 -0
- data/spec/script/ruby_script_spec.rb +18 -0
- data/spec/script/sql_script_spec.rb +41 -0
- data/spec/spec_helper.rb +24 -0
- data/spec/support/config/database.yml +1 -0
- data/spec/support/config/sources.yml +9 -0
- data/spec/transform/calculated_columns_spec.rb +36 -0
- data/spec/transform/chained_step_spec.rb +36 -0
- data/spec/transform/constant_column_spec.rb +22 -0
- data/spec/transform/map_values_spec.rb +26 -0
- data/spec/transform/rename_column_spec.rb +25 -0
- data/spec/transform/select_column_spec.rb +21 -0
- metadata +344 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 319124d81efc2cd9b0963c9f1693862d7347170e
|
4
|
+
data.tar.gz: 9e89b3071f090742e273d80a0023f703064f7325
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: e4b71fb0ccdcec9e48cae895382e069b67680924b56901e034a10e6e452448081b2ea0fa5538e63e334ed2c15760dfa0143799795cbd753488fc72ba5b0d638f
|
7
|
+
data.tar.gz: cc404b8fa1e0404e93caa8ab29d031a1496f7d74fe4440d55a524ff623d6cc6afc4ca62e87c19806ae18955768e14d1facb30728f0a41a4c1453f75fc0419620
|
data/.autotest
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
require 'autotest/fsevent'
|
2
|
+
require 'autotest/growl'
|
3
|
+
require 'autotest/fsevent'
|
4
|
+
require 'autotest/timestamp'
|
5
|
+
|
6
|
+
Autotest.add_hook :initialize do |at|
|
7
|
+
at.clear_mappings
|
8
|
+
|
9
|
+
%w{.git coverage/ README gemspec DS_Store features/ Gemfile.lock Rakefile Gemfile}.each do |exception|
|
10
|
+
at.add_exception exception
|
11
|
+
end
|
12
|
+
|
13
|
+
at.add_mapping %r%^lib/itiel/(.*)\.rb$% do |_, m|
|
14
|
+
["test/unit/#{m[1]}_test.rb"]
|
15
|
+
end
|
16
|
+
|
17
|
+
end
|
data/.gitignore
ADDED
data/.gitlab-ci.yml
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
cache:
|
2
|
+
paths:
|
3
|
+
- vendor/ruby
|
4
|
+
|
5
|
+
before_script:
|
6
|
+
- ruby -v # Print out ruby version for debugging
|
7
|
+
- gem install bundler --no-ri --no-rdoc # Bundler is not installed with the image
|
8
|
+
- bundle install -j $(nproc) --path vendor # Install dependencies into ./vendor/ruby
|
9
|
+
|
10
|
+
tests:2.3.rails.4.0:
|
11
|
+
image: "ruby:2.3"
|
12
|
+
variables:
|
13
|
+
BUNDLE_GEMFILE: "Gemfile.rails.4.0"
|
14
|
+
script:
|
15
|
+
- bundle exec rake
|
16
|
+
|
17
|
+
tests:2.3.rails.4.1:
|
18
|
+
image: "ruby:2.3"
|
19
|
+
variables:
|
20
|
+
BUNDLE_GEMFILE: "Gemfile.rails.4.1"
|
21
|
+
script:
|
22
|
+
- bundle exec rake
|
23
|
+
|
24
|
+
tests:2.2.rails.4.0:
|
25
|
+
image: "ruby:2.2"
|
26
|
+
variables:
|
27
|
+
BUNDLE_GEMFILE: "Gemfile.rails.4.0"
|
28
|
+
script:
|
29
|
+
- bundle exec rake
|
30
|
+
|
31
|
+
tests:2.2.rails.4.1:
|
32
|
+
image: "ruby:2.2"
|
33
|
+
variables:
|
34
|
+
BUNDLE_GEMFILE: "Gemfile.rails.4.1"
|
35
|
+
script:
|
36
|
+
- bundle exec rake
|
data/.rspec
ADDED
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
2.3.2
|
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/Gemfile.rails.4.0
ADDED
data/Gemfile.rails.4.1
ADDED
data/Gemfile.rails.4.2
ADDED
data/README.markdown
ADDED
@@ -0,0 +1,106 @@
|
|
1
|
+
# Itiel
|
2
|
+
|
3
|
+
[](https://gitlab.com/dabit/itiel/commits/master)
|
4
|
+
[](https://gitlab.com/dabit/itiel/commits/master)
|
5
|
+
|
6
|
+
Hopefully, it will be an awesome Framework to do ETL with Ruby. It
|
7
|
+
should only work with *Ruby 2.0* or newer.
|
8
|
+
|
9
|
+
[A working example](https://github.com/railsmx/rails-mx-blog/blob/master/itiel/posts.rb)
|
10
|
+
|
11
|
+
# This README is work in progress
|
12
|
+
|
13
|
+
You should not trust it, until I clean it up.
|
14
|
+
|
15
|
+
For better examples and documentation, refer to the *features* folder.
|
16
|
+
|
17
|
+
|
18
|
+
# TODO: Everything I want it to be, below:
|
19
|
+
|
20
|
+
### Define the Itiel project structure
|
21
|
+
|
22
|
+
I have in mind something like this:
|
23
|
+
|
24
|
+
/itiel_example/
|
25
|
+
|~config/
|
26
|
+
| `-sources.yml
|
27
|
+
|~lib/
|
28
|
+
| `-transformation_class1.rb
|
29
|
+
|~jobs/
|
30
|
+
| `-main_job.rb
|
31
|
+
|~tasks/
|
32
|
+
| `-itiel.rake
|
33
|
+
|-Gemfile
|
34
|
+
|-Gemfile.lock
|
35
|
+
|
36
|
+
### Controllers
|
37
|
+
|
38
|
+
Take care of defining and running the jobs. It's where you put all the
|
39
|
+
code like the example above.
|
40
|
+
|
41
|
+
### A command line bin to generate the basic structure
|
42
|
+
|
43
|
+
itiel new project_name.
|
44
|
+
|
45
|
+
Use thor?
|
46
|
+
|
47
|
+
### Rake
|
48
|
+
|
49
|
+
rake itiel:run
|
50
|
+
|
51
|
+
### Mongo
|
52
|
+
|
53
|
+
Picture yourself migrating from SQL to Mongo
|
54
|
+
|
55
|
+
### Joins, Scripts and lots more transformations
|
56
|
+
|
57
|
+
Some on my mind:
|
58
|
+
|
59
|
+
Itiel::Joins::DatabaseTable
|
60
|
+
Itiel::Lookups::DatabaseTable
|
61
|
+
Itiel::Loads::FTP
|
62
|
+
Itiel::Loads::HTTPRequest
|
63
|
+
Itiel::Loads::MongoDBDocument
|
64
|
+
Itiel::Script::ExecuteInSystem
|
65
|
+
Itiel::Extracts::HTTP
|
66
|
+
Itiel::Extracts::MongoDBDocument
|
67
|
+
Itiel::Transforms::CustomSort
|
68
|
+
Itiel::Transforms::MapValues
|
69
|
+
Itiel::Transforms::IfNil
|
70
|
+
|
71
|
+
Just ideas, I have nothing on how to implement most of them
|
72
|
+
|
73
|
+
### Filters and conditional stream routing
|
74
|
+
|
75
|
+
If the column X has a value of Y, send true output to this step, send
|
76
|
+
false output to this other step
|
77
|
+
|
78
|
+
### Slowly Changing Dimensions
|
79
|
+
|
80
|
+
Can't have ETL without these
|
81
|
+
|
82
|
+
### Test helpers
|
83
|
+
|
84
|
+
So you can TDD your ETL
|
85
|
+
|
86
|
+
### Logging
|
87
|
+
|
88
|
+
It would be extra nice if all the steps logged statistics
|
89
|
+
|
90
|
+
* Processing X rows of Y
|
91
|
+
* Time spent
|
92
|
+
* Transformation Name
|
93
|
+
* Configurable log level
|
94
|
+
|
95
|
+
### Viz
|
96
|
+
|
97
|
+
rake itiel:graph
|
98
|
+
|
99
|
+
Creates a graphical representation of your ETL flow
|
100
|
+
|
101
|
+
### Ruby 1.8 and 1.9
|
102
|
+
|
103
|
+
Not really, Ruby 2.0+ is enough. I don't have plans to make it
|
104
|
+
work on 1.8 or 1.9
|
105
|
+
|
106
|
+
|
data/Rakefile
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'bundler'
|
2
|
+
Bundler::GemHelper.install_tasks
|
3
|
+
|
4
|
+
require 'rspec/core/rake_task'
|
5
|
+
require 'cucumber/rake/task'
|
6
|
+
|
7
|
+
task :default => [:spec, :cucumber]
|
8
|
+
|
9
|
+
RSpec::Core::RakeTask.new
|
10
|
+
|
11
|
+
Cucumber::Rake::Task.new do |task|
|
12
|
+
task.cucumber_opts = %w{--format progress}
|
13
|
+
end
|
data/build.sh
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
Feature: Extract data from a Database Table
|
2
|
+
|
3
|
+
@database
|
4
|
+
Scenario:
|
5
|
+
Given a blogposts database table with rows:
|
6
|
+
| id | title | author |
|
7
|
+
| 1 | The great tool | Tim Taylor |
|
8
|
+
| 2 | A footbal anecdote | John Miller |
|
9
|
+
| 3 | Historical music | John Mellencamp |
|
10
|
+
|
11
|
+
# @database_table = Itiel::Extract::DatabaseTable.new
|
12
|
+
# @database_table.connection = :test
|
13
|
+
# @database_table.table_name = 'blogposts'
|
14
|
+
When I create a Itiel::Extract::DatabaseTable object for the 'blogposts' table
|
15
|
+
Then the extraction for that object should have 3 rows
|
16
|
+
|
@@ -0,0 +1,17 @@
|
|
1
|
+
Feature: Extract rows from a database using a custom SQL script
|
2
|
+
|
3
|
+
@database
|
4
|
+
Scenario:
|
5
|
+
Given a blogposts database table with rows:
|
6
|
+
| id | title | author |
|
7
|
+
| 1 | The great tool | Tim Taylor |
|
8
|
+
| 2 | A footbal anecdote | John Miller |
|
9
|
+
| 3 | Historical music | John Mellencamp |
|
10
|
+
|
11
|
+
When I create a Itiel::Extract::CustomSQL object with the query:
|
12
|
+
"""
|
13
|
+
SELECT * FROM blogposts WHERE id = 3;
|
14
|
+
"""
|
15
|
+
|
16
|
+
Then the extraction for that object should have 1 row
|
17
|
+
|
@@ -0,0 +1,21 @@
|
|
1
|
+
Feature: Load data into a database table
|
2
|
+
|
3
|
+
Background:
|
4
|
+
Given a blogposts database table
|
5
|
+
And a "source.csv" file with the following rows:
|
6
|
+
| id | title | author |
|
7
|
+
| 1 | Pride and Prejudice | Jane Austen |
|
8
|
+
| 2 | Treasure Island | R L Stevenson |
|
9
|
+
| 3 | Steve Jobs | Walter Isaacson |
|
10
|
+
| 4 | The thank you economy | Gary Vaynerchuk |
|
11
|
+
|
12
|
+
# @source = Itiel::Extract::CSVFile.new('source.csv')
|
13
|
+
And I create a Extractor::CSVFile object with "source.csv"
|
14
|
+
# @destination = Itiel::Load::DatabaseTable.new :test, "blogposts"
|
15
|
+
And I create a Loader::DatabaseTable object for the "blogposts" table
|
16
|
+
|
17
|
+
@database
|
18
|
+
Scenario:
|
19
|
+
When I load the source into the destination
|
20
|
+
Then the "blogposts" table should have 4 records
|
21
|
+
|
@@ -0,0 +1,41 @@
|
|
1
|
+
Feature: Lookup data from a CSV file
|
2
|
+
|
3
|
+
@database
|
4
|
+
Scenario:
|
5
|
+
Given an orders database table with rows:
|
6
|
+
| id | email |
|
7
|
+
| 1 | john@example.com |
|
8
|
+
| 2 | tim@test.com |
|
9
|
+
| 3 | math@example.com |
|
10
|
+
|
11
|
+
And a "customers.csv" file with the following rows:
|
12
|
+
| id | email |
|
13
|
+
| 1 | john@example.com |
|
14
|
+
| 2 | tim@test.com |
|
15
|
+
| 3 | math@example.com |
|
16
|
+
|
17
|
+
# @database_table = Itiel::Extract::DatabaseTable.new
|
18
|
+
# @database_table.connection = :test
|
19
|
+
# @database_table.table_name = 'orders'
|
20
|
+
And I create a Itiel::Extract::DatabaseTable object for the 'orders' table
|
21
|
+
|
22
|
+
# @lookup = Itiel::Lookup::CSVFile.new('customers.csv')
|
23
|
+
And I create a Itiel::Lookup::CSVFile object with the "customers.csv" file
|
24
|
+
|
25
|
+
# @lookup.lookup_columns = { :email => :email }
|
26
|
+
And the lookup joins the "email" column in the source with the "email" column in the lookup stream
|
27
|
+
|
28
|
+
# @lookup.joined_columns = { :id => :customer_id }
|
29
|
+
And the lookup appends the "id" column on the lookup stream to the source as "customer_id"
|
30
|
+
|
31
|
+
And the data flows in the following direction:
|
32
|
+
| @database_table |
|
33
|
+
| @lookup |
|
34
|
+
|
35
|
+
When I start the source
|
36
|
+
|
37
|
+
Then the output for the lookup should be:
|
38
|
+
| id | email | customer_id |
|
39
|
+
| 1 | john@example.com | 1 |
|
40
|
+
| 2 | tim@test.com | 2 |
|
41
|
+
| 3 | math@example.com | 3 |
|
@@ -0,0 +1,43 @@
|
|
1
|
+
Feature: Lookup data from a Database Table
|
2
|
+
|
3
|
+
@database
|
4
|
+
Scenario:
|
5
|
+
Given a blogposts database table with rows:
|
6
|
+
| id | title | author |
|
7
|
+
| 1 | The great tool | Tim Taylor |
|
8
|
+
| 2 | A football anecdote | John Miller |
|
9
|
+
| 3 | Historical music | John Mellencamp |
|
10
|
+
|
11
|
+
And an authors database table with rows:
|
12
|
+
| id | name |
|
13
|
+
| 1 | Tim Taylor |
|
14
|
+
| 2 | John Miller |
|
15
|
+
| 3 | John Mellencamp |
|
16
|
+
|
17
|
+
# @database_table = Itiel::Extract::DatabaseTable.new
|
18
|
+
# @database_table.connection = :test
|
19
|
+
# @database_table.table_name = 'blogposts'
|
20
|
+
And I create a Itiel::Extract::DatabaseTable object for the 'blogposts' table
|
21
|
+
|
22
|
+
# @lookup = Itiel::Lookup::DatabaseTable.new
|
23
|
+
# @lookup.connection = :test
|
24
|
+
# @lookup.table_name = 'authors'
|
25
|
+
And I create a Itiel::Lookup::DatabaseTable object for the "authors"
|
26
|
+
|
27
|
+
# @lookup.lookup_columns = { :author => :name }
|
28
|
+
And the lookup joins the "author" column in the source with the "name" column in the lookup stream
|
29
|
+
|
30
|
+
# @lookup.joined_columns = { :id => :author_id }
|
31
|
+
And the lookup appends the "id" column on the lookup stream to the source as "author_id"
|
32
|
+
|
33
|
+
And the data flows in the following direction:
|
34
|
+
| @database_table |
|
35
|
+
| @lookup |
|
36
|
+
|
37
|
+
When I start the source
|
38
|
+
|
39
|
+
Then the output for the lookup should be:
|
40
|
+
| id | title | author | author_id |
|
41
|
+
| 1 | The great tool | Tim Taylor | 1 |
|
42
|
+
| 2 | A football anecdote | John Miller | 2 |
|
43
|
+
| 3 | Historical music | John Mellencamp | 3 |
|
@@ -0,0 +1,19 @@
|
|
1
|
+
Feature: Ruby scripting
|
2
|
+
|
3
|
+
Scenario:
|
4
|
+
Given the following data is in the stream:
|
5
|
+
| id | title | author |
|
6
|
+
| 1 | The great tool | Tim Taylor |
|
7
|
+
| 2 | A football anecdote | John Miller |
|
8
|
+
| 3 | Historical music | John Mellencamp |
|
9
|
+
|
10
|
+
And I create a Itiel::Script::RubyScript object with the following block of code:
|
11
|
+
"""
|
12
|
+
row["slug"] = row["title"].parameterize
|
13
|
+
"""
|
14
|
+
|
15
|
+
Then the resulting output stream should be:
|
16
|
+
| id | title | author | slug |
|
17
|
+
| 1 | The great tool | Tim Taylor | the-great-tool |
|
18
|
+
| 2 | A football anecdote | John Miller | a-football-anecdote |
|
19
|
+
| 3 | Historical music | John Mellencamp | historical-music |
|
@@ -0,0 +1,15 @@
|
|
1
|
+
Given /^a "([^"]*)" file with the following rows:$/ do |filename, table|
|
2
|
+
CSV.open File.join('tmp', filename), "wb" do |csv|
|
3
|
+
table.raw.each { |row| csv << row }
|
4
|
+
end
|
5
|
+
end
|
6
|
+
|
7
|
+
Then /^the "([^"]*)" file should exist with the following content:$/ do |filename, table|
|
8
|
+
expected_file = CSV.generate do |csv|
|
9
|
+
table.raw.each { |row| csv << row }
|
10
|
+
end
|
11
|
+
|
12
|
+
resulting_file = File.new(File.join('tmp', filename), "rb").read
|
13
|
+
|
14
|
+
expect(resulting_file).to eq expected_file
|
15
|
+
end
|