mini_etl 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.rubocop.yml +98 -0
- data/.rubocop_todo.yml +14 -0
- data/CHANGELOG.md +18 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +74 -0
- data/README.md +145 -0
- data/Rakefile +72 -0
- data/bin/console +15 -0
- data/bin/setup +8 -0
- data/lib/mini_etl.rb +27 -0
- data/lib/mini_etl/generator.rb +48 -0
- data/lib/mini_etl/process.rb +49 -0
- data/lib/mini_etl/source.rb +45 -0
- data/lib/mini_etl/strategies/csv_strategy.rb +22 -0
- data/lib/mini_etl/strategy.rb +14 -0
- data/lib/mini_etl/util/status.rb +27 -0
- data/lib/mini_etl/version.rb +5 -0
- data/lib/tasks/support/generation.rb +54 -0
- data/mini_etl.gemspec +45 -0
- metadata +160 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: bf6d9d9f6c520143a0ed4798dae23ac0dd17d6acd338e4c954a650433c111fc5
|
4
|
+
data.tar.gz: 3b2cc9482751be5af528bfa9046eb55dd53a8d8a19eb368f80f775dc6747c72b
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: acecce229c1db06eef8cabf6e2278bcfd4b0da83976e2e30101c6a3a1d0908cc002932ce292ff7489e430aaff95adf6677426072ebe03151194b696efe3e4286
|
7
|
+
data.tar.gz: 159a16e1c228a3b19485534a065b8c84a44bf93e54e1c19316723166d39550f0e3fdfe4a88184e8114854d167e83fa6d225783c97aad10d54e4762f5f894bdaa
|
data/.gitignore
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# Ignore generated docs
|
2
|
+
/.yardoc
|
3
|
+
/_yardoc
|
4
|
+
/doc
|
5
|
+
README.html
|
6
|
+
|
7
|
+
# Ignore generated files
|
8
|
+
/.bundle
|
9
|
+
/pkg
|
10
|
+
/samples
|
11
|
+
/tmp
|
12
|
+
|
13
|
+
# Ignore test generated files
|
14
|
+
.rspec
|
15
|
+
.rspec_status
|
16
|
+
/coverage
|
17
|
+
/spec/reports
|
18
|
+
spec/examples.txt
|
data/.rubocop.yml
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
inherit_from: .rubocop_todo.yml
|
2
|
+
|
3
|
+
AllCops:
|
4
|
+
RubyInterpreters:
|
5
|
+
- ruby
|
6
|
+
# Include common Ruby source files.
|
7
|
+
Include:
|
8
|
+
- '**/*.rb'
|
9
|
+
- '**/*.gemfile'
|
10
|
+
- '**/*.gemspec'
|
11
|
+
- '**/*.jb'
|
12
|
+
- '**/*.jbuilder'
|
13
|
+
- '**/*.rake'
|
14
|
+
- '**/*.spec'
|
15
|
+
Exclude:
|
16
|
+
- 'node_modules/**/*'
|
17
|
+
- 'vendor/**/*'
|
18
|
+
- '.git/**/*'
|
19
|
+
# Default formatter will be used if no `-f/--format` option is given.
|
20
|
+
DefaultFormatter: progress
|
21
|
+
# Cop names are displayed in offense messages by default. Change behavior
|
22
|
+
# by overriding DisplayCopNames, or by giving the `--no-display-cop-names`
|
23
|
+
# option.
|
24
|
+
DisplayCopNames: true
|
25
|
+
# Style guide URLs are not displayed in offense messages by default. Change
|
26
|
+
# behavior by overriding `DisplayStyleGuide`, or by giving the
|
27
|
+
# `-S/--display-style-guide` option.
|
28
|
+
DisplayStyleGuide: false
|
29
|
+
# When specifying style guide URLs, any paths and/or fragments will be
|
30
|
+
# evaluated relative to the base URL.
|
31
|
+
StyleGuideBaseURL: https://github.com/rubocop-hq/ruby-style-guide
|
32
|
+
# Extra details are not displayed in offense messages by default. Change
|
33
|
+
# behavior by overriding ExtraDetails, or by giving the
|
34
|
+
# `-E/--extra-details` option.
|
35
|
+
ExtraDetails: false
|
36
|
+
# Additional cops that do not reference a style guide rule may be enabled by
|
37
|
+
# default. Change behavior by overriding `StyleGuideCopsOnly`, or by giving
|
38
|
+
# the `--only-guide-cops` option.
|
39
|
+
StyleGuideCopsOnly: false
|
40
|
+
# All cops except the ones configured `Enabled: false` in this file are enabled by default.
|
41
|
+
# Change this behavior by overriding either `DisabledByDefault` or `EnabledByDefault`.
|
42
|
+
# When `DisabledByDefault` is `true`, all cops in the default configuration
|
43
|
+
# are disabled, and only cops in user configuration are enabled. This makes
|
44
|
+
# cops opt-in instead of opt-out. Note that when `DisabledByDefault` is `true`,
|
45
|
+
# cops in user configuration will be enabled even if they don't set the
|
46
|
+
# Enabled parameter.
|
47
|
+
# When `EnabledByDefault` is `true`, all cops, even those configured `Enabled: false`
|
48
|
+
# in this file are enabled by default. Cops can still be disabled in user configuration.
|
49
|
+
# Note that it is invalid to set both EnabledByDefault and DisabledByDefault
|
50
|
+
# to true in the same configuration.
|
51
|
+
EnabledByDefault: false
|
52
|
+
DisabledByDefault: false
|
53
|
+
# Enables the result cache if `true`. Can be overridden by the `--cache` command
|
54
|
+
# line option.
|
55
|
+
UseCache: true
|
56
|
+
# Threshold for how many files can be stored in the result cache before some
|
57
|
+
# of the files are automatically removed.
|
58
|
+
MaxFilesInCache: 20000
|
59
|
+
# The cache will be stored in "rubocop_cache" under this directory. If
|
60
|
+
# CacheRootDirectory is ~ (nil), which it is by default, the root will be
|
61
|
+
# taken from the environment variable `$XDG_CACHE_HOME` if it is set, or if
|
62
|
+
# `$XDG_CACHE_HOME` is not set, it will be `$HOME/.cache/`.
|
63
|
+
CacheRootDirectory: ~
|
64
|
+
# It is possible for a malicious user to know the location of RuboCop's cache
|
65
|
+
# directory by looking at CacheRootDirectory, and create a symlink in its
|
66
|
+
# place that could cause RuboCop to overwrite unintended files, or read
|
67
|
+
# malicious input. If you are certain that your cache location is secure from
|
68
|
+
# this kind of attack, and wish to use a symlinked cache location, set this
|
69
|
+
# value to "true".
|
70
|
+
AllowSymlinksInCacheRootDirectory: false
|
71
|
+
# What MRI version of the Ruby interpreter is the inspected code intended to
|
72
|
+
# run on? (If there is more than one, set this to the lowest version.)
|
73
|
+
# If a value is specified for TargetRubyVersion then it is used. Acceptable
|
74
|
+
# values are specified as a float (e.g. 2.5); the teeny version of Ruby
|
75
|
+
# should not be included. If the project specifies a Ruby version in the
|
76
|
+
# .ruby-version file, Gemfile or gems.rb file, RuboCop will try to determine
|
77
|
+
# the desired version of Ruby by inspecting the .ruby-version file first,
|
78
|
+
# followed by the Gemfile.lock or gems.locked file. (Although the Ruby version
|
79
|
+
# is specified in the Gemfile or gems.rb file, RuboCop reads the final value
|
80
|
+
# from the lock file.) If the Ruby version is still unresolved, RuboCop will
|
81
|
+
# use the oldest officially supported Ruby version (currently Ruby 2.3).
|
82
|
+
TargetRubyVersion: ~
|
83
|
+
# What version of Rails is the inspected code using? If a value is specified
|
84
|
+
# for TargetRailsVersion then it is used. Acceptable values are specified
|
85
|
+
# as a float (i.e. 5.1); the patch version of Rails should not be included.
|
86
|
+
# If TargetRailsVersion is not set, RuboCop will parse the Gemfile.lock or
|
87
|
+
# gems.locked file to find the version of Rails that has been bound to the
|
88
|
+
# application. If neither of those files exist, RuboCop will use Rails 5.0
|
89
|
+
# as the default.
|
90
|
+
TargetRailsVersion: ~
|
91
|
+
|
92
|
+
Style/FrozenStringLiteralComment:
|
93
|
+
Enabled: false
|
94
|
+
Metrics/LineLength:
|
95
|
+
Max: 150
|
96
|
+
Metrics/BlockLength:
|
97
|
+
Exclude:
|
98
|
+
- 'spec/**/*'
|
data/.rubocop_todo.yml
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# This configuration was generated by
|
2
|
+
# `rubocop --auto-gen-config`
|
3
|
+
# on 2019-06-04 10:13:24 -0500 using RuboCop version 0.71.0.
|
4
|
+
# The point is for the user to remove these configuration records
|
5
|
+
# one by one as the offenses are removed from the code base.
|
6
|
+
# Note that changes in the inspected code, or installation of new
|
7
|
+
# versions of RuboCop, may require this file to be generated again.
|
8
|
+
|
9
|
+
# Offense count: 1
|
10
|
+
Style/Documentation:
|
11
|
+
Exclude:
|
12
|
+
- 'spec/**/*'
|
13
|
+
- 'test/**/*'
|
14
|
+
- 'lib/etl.rb'
|
data/CHANGELOG.md
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# Changelog
|
2
|
+
All notable changes to this project will be documented in this file.
|
3
|
+
|
4
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
5
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
6
|
+
|
7
|
+
## [Unreleased]
|
8
|
+
|
9
|
+
## [0.1.0] - 2019-06-06
|
10
|
+
|
11
|
+
### Added
|
12
|
+
- [Task] Improve spec coverage
|
13
|
+
- [Base] Introduce Generator for structure generation
|
14
|
+
- [Base] Factor out strategies
|
15
|
+
- [Base] General extraction strategy, still static
|
16
|
+
- [Base] Basic, more useful documentation
|
17
|
+
- [Base] CSV Sample strategy: File generation and sourcing
|
18
|
+
- [Base] Initial application
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
mini_etl (0.2.0)
|
5
|
+
|
6
|
+
GEM
|
7
|
+
remote: https://rubygems.org/
|
8
|
+
specs:
|
9
|
+
ast (2.4.0)
|
10
|
+
coderay (1.1.2)
|
11
|
+
concurrent-ruby (1.1.5)
|
12
|
+
diff-lcs (1.3)
|
13
|
+
docile (1.3.1)
|
14
|
+
faker (1.9.3)
|
15
|
+
i18n (>= 0.7)
|
16
|
+
i18n (1.6.0)
|
17
|
+
concurrent-ruby (~> 1.0)
|
18
|
+
jaro_winkler (1.5.2)
|
19
|
+
json (2.2.0)
|
20
|
+
method_source (0.9.2)
|
21
|
+
parallel (1.17.0)
|
22
|
+
parser (2.6.3.0)
|
23
|
+
ast (~> 2.4.0)
|
24
|
+
pry (0.12.2)
|
25
|
+
coderay (~> 1.1.0)
|
26
|
+
method_source (~> 0.9.0)
|
27
|
+
rainbow (3.0.0)
|
28
|
+
rake (10.5.0)
|
29
|
+
rspec (3.8.0)
|
30
|
+
rspec-core (~> 3.8.0)
|
31
|
+
rspec-expectations (~> 3.8.0)
|
32
|
+
rspec-mocks (~> 3.8.0)
|
33
|
+
rspec-core (3.8.0)
|
34
|
+
rspec-support (~> 3.8.0)
|
35
|
+
rspec-expectations (3.8.3)
|
36
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
37
|
+
rspec-support (~> 3.8.0)
|
38
|
+
rspec-mocks (3.8.0)
|
39
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
40
|
+
rspec-support (~> 3.8.0)
|
41
|
+
rspec-support (3.8.0)
|
42
|
+
rubocop (0.71.0)
|
43
|
+
jaro_winkler (~> 1.5.1)
|
44
|
+
parallel (~> 1.10)
|
45
|
+
parser (>= 2.6)
|
46
|
+
rainbow (>= 2.2.2, < 4.0)
|
47
|
+
ruby-progressbar (~> 1.7)
|
48
|
+
unicode-display_width (>= 1.4.0, < 1.7)
|
49
|
+
ruby-progressbar (1.10.1)
|
50
|
+
simplecov (0.16.1)
|
51
|
+
docile (~> 1.1)
|
52
|
+
json (>= 1.8, < 3)
|
53
|
+
simplecov-html (~> 0.10.0)
|
54
|
+
simplecov-html (0.10.2)
|
55
|
+
unicode-display_width (1.6.0)
|
56
|
+
|
57
|
+
PLATFORMS
|
58
|
+
ruby
|
59
|
+
|
60
|
+
DEPENDENCIES
|
61
|
+
bundler (~> 1.16)
|
62
|
+
faker (~> 1.9)
|
63
|
+
mini_etl!
|
64
|
+
pry (~> 0.12.2)
|
65
|
+
rake (~> 10.0)
|
66
|
+
rspec (~> 3.0)
|
67
|
+
rubocop (~> 0.71.0)
|
68
|
+
simplecov (~> 0.16.1)
|
69
|
+
|
70
|
+
RUBY VERSION
|
71
|
+
ruby 2.6.3p62
|
72
|
+
|
73
|
+
BUNDLED WITH
|
74
|
+
1.17.2
|
data/README.md
ADDED
@@ -0,0 +1,145 @@
|
|
1
|
+
# Mini ETL
|
2
|
+
|
3
|
+
Basic toolkit for Extract/Transform/Load operations. Abstracts the details of
|
4
|
+
performing sourcing, intermediate structure generation and data persistence.
|
5
|
+
|
6
|
+
## Usage
|
7
|
+
|
8
|
+
### Sourcing
|
9
|
+
|
10
|
+
A `MiniEtl` `Process` is kicked off by configuring a process. For a basic CSV file
|
11
|
+
deserialize and bulk load:
|
12
|
+
|
13
|
+
```
|
14
|
+
process = MiniEtl.create_process do |process|
|
15
|
+
process.source.type = :csv
|
16
|
+
process.source.location = 'samples/small.csv'
|
17
|
+
end
|
18
|
+
|
19
|
+
process.bootstrap
|
20
|
+
```
|
21
|
+
|
22
|
+
TODO: Write a strategy for HTTP, use JSON server
|
23
|
+
|
24
|
+
```
|
25
|
+
process = MiniEtl.create_process do |process|
|
26
|
+
process.source.type = :http
|
27
|
+
process.source.location = 'localhost:8080/sample'
|
28
|
+
end
|
29
|
+
|
30
|
+
process.bootstrap
|
31
|
+
```
|
32
|
+
|
33
|
+
Strategies are available for CSV and JSON. If you need something else entirely,
|
34
|
+
a manual source can be used instead:
|
35
|
+
|
36
|
+
```
|
37
|
+
process = MiniEtl.create_process do |process|
|
38
|
+
process.source.type = :manual
|
39
|
+
process.source.method = Proc.new do
|
40
|
+
...
|
41
|
+
end
|
42
|
+
end
|
43
|
+
```
|
44
|
+
|
45
|
+
### Structure generation
|
46
|
+
|
47
|
+
Once data sourcing is complete, data can be fetched in-place.
|
48
|
+
|
49
|
+
```
|
50
|
+
process = MiniEtl.create_process do |process|
|
51
|
+
process.source.type = :csv
|
52
|
+
process.source.location = 'samples/small.csv'
|
53
|
+
end
|
54
|
+
|
55
|
+
process.bootstrap
|
56
|
+
process.generate
|
57
|
+
|
58
|
+
process.generator.structures # intermediate structure for bulk import
|
59
|
+
```
|
60
|
+
|
61
|
+
If the data source is too large to process in memory, an iterator can be given
|
62
|
+
instead:
|
63
|
+
|
64
|
+
```
|
65
|
+
process = MiniEtl.create_process do |process|
|
66
|
+
process.source.type = :csv
|
67
|
+
process.source.location = 'samples/large.csv'
|
68
|
+
process.generator.lazy = true
|
69
|
+
end
|
70
|
+
|
71
|
+
process.bootstrap
|
72
|
+
process.generator.start do |structures|
|
73
|
+
...
|
74
|
+
end
|
75
|
+
```
|
76
|
+
|
77
|
+
### Data persistence
|
78
|
+
|
79
|
+
Finally, once the data is shaped the way you need it, it can be persisted in
|
80
|
+
any way you like. The receiver class is expected to respond to
|
81
|
+
`.create(args)`
|
82
|
+
|
83
|
+
```
|
84
|
+
process = MiniEtl.create_process do |process|
|
85
|
+
process.source.type = :csv
|
86
|
+
process.source.location = 'samples/large.csv'
|
87
|
+
process.store.type = Person # An active record model
|
88
|
+
end
|
89
|
+
|
90
|
+
process.bootstrap
|
91
|
+
process.generate
|
92
|
+
process.persist
|
93
|
+
```
|
94
|
+
|
95
|
+
In this way, any arbitrary store can be created:
|
96
|
+
|
97
|
+
```
|
98
|
+
class Payroll
|
99
|
+
Struct.new(:target, :name, :last_name, ...)
|
100
|
+
@@data = []
|
101
|
+
|
102
|
+
def create(params = {})
|
103
|
+
@@data << Struct::Target.new(name: params[:name], last_name: params[:last_name], ...)
|
104
|
+
end
|
105
|
+
end
|
106
|
+
|
107
|
+
process = MiniEtl.create_process do |process|
|
108
|
+
process.source.type = :csv
|
109
|
+
process.source.location = 'samples/small.csv'
|
110
|
+
process.store.type = Payroll
|
111
|
+
end
|
112
|
+
|
113
|
+
process.bootstrap
|
114
|
+
process.generate
|
115
|
+
process.persist
|
116
|
+
```
|
117
|
+
|
118
|
+
## Development
|
119
|
+
|
120
|
+
TODO: Test stuff
|
121
|
+
`$ rake`
|
122
|
+
|
123
|
+
Runs rspec, rubocop, generates coverage report
|
124
|
+
|
125
|
+
TODO: Explain how to generate csv files and the rest of the samples
|
126
|
+
NOTE: This will take ~5.5 mins, super slow, would need a parallel version
|
127
|
+
```
|
128
|
+
$ rake sample:csv:all
|
129
|
+
```
|
130
|
+
|
131
|
+
TODO: Explain how to use JSON Server to provide a fake API
|
132
|
+
|
133
|
+
```
|
134
|
+
$ npm install -g json-server
|
135
|
+
$ rake sample:json:small
|
136
|
+
$ json-server --watch samples/small.json --port 3001
|
137
|
+
```
|
138
|
+
|
139
|
+
API is now available at `localhost:3001/payroll`
|
140
|
+
|
141
|
+
...
|
142
|
+
|
143
|
+
## Contributing
|
144
|
+
|
145
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/ggalindezb/mini_etl.
|
data/Rakefile
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'bundler/gem_tasks'
|
4
|
+
require 'rspec/core/rake_task'
|
5
|
+
require_relative 'lib/tasks/support/generation'
|
6
|
+
|
7
|
+
require 'rubygems'
|
8
|
+
require 'bundler'
|
9
|
+
Bundler.require(:default, :development)
|
10
|
+
|
11
|
+
RSpec::Core::RakeTask.new(:spec)
|
12
|
+
|
13
|
+
task default: :spec_n_cop
|
14
|
+
|
15
|
+
# Runs the spec suite, then RuboCop, and fails the task when RuboCop does.
task :spec_n_cop do
  Rake::Task['spec'].invoke

  require 'rubocop'

  # Pass an explicit, empty argument list: RuboCop::CLI#run falls back to ARGV
  # when called without arguments, and under rake ARGV holds the task names
  # (e.g. "spec_n_cop"), not paths to inspect.
  status = RuboCop::CLI.new.run([])

  # Surface a non-zero RuboCop exit status instead of silently discarding it.
  abort("RuboCop exited with status #{status}") unless status.zero?
end
|
21
|
+
|
22
|
+
# Tasks that generate sample data files through Support::Generation.
namespace :sample do
  include Support::Generation

  sample_sizes = %i[small medium large]

  namespace :csv do
    desc 'Generate CSV samples'
    task :all do
      sample_sizes.each { |size| generate_csv(size) }
    end

    # One task per size; the approximate file size is only used in the
    # description shown by `rake -T`.
    { small: '~1 MB', medium: '~10 MB', large: '~100 MB' }.each do |size, approx|
      desc "Generate a CSV sample, #{approx}"
      task(size) { generate_csv(size) }
    end
  end

  namespace :json do
    # Previously described as "Generate CSV samples", which mislabelled the
    # task in `rake -T`.
    desc 'Generate JSON samples'
    task :all do
      sample_sizes.each { |size| generate_json(size) }
    end

    { small: '~2 MB', medium: '~20 MB', large: '~200 MB' }.each do |size, approx|
      desc "Generate a JSON sample, #{approx}"
      task(size) { generate_json(size) }
    end
  end
end
|
data/bin/console
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
require 'bundler/setup'
|
5
|
+
require 'mini_etl'
|
6
|
+
|
7
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
8
|
+
# with your gem easier. You can also use a different console, if you like.
|
9
|
+
|
10
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
11
|
+
# require "pry"
|
12
|
+
# Pry.start
|
13
|
+
|
14
|
+
require 'irb'
|
15
|
+
IRB.start(__FILE__)
|
data/bin/setup
ADDED
data/lib/mini_etl.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'mini_etl/version'
|
4
|
+
|
5
|
+
require 'rubygems'
|
6
|
+
require 'bundler'
|
7
|
+
Bundler.require(:default)
|
8
|
+
|
9
|
+
require 'mini_etl/util/status'
|
10
|
+
require 'mini_etl/strategy'
|
11
|
+
require 'mini_etl/strategies/csv_strategy'
|
12
|
+
require 'mini_etl/source'
|
13
|
+
require 'mini_etl/generator'
|
14
|
+
require 'mini_etl/process'
|
15
|
+
|
16
|
+
# Place exception here
|
17
|
+
# Top-level namespace; exposes the entry point used to build a process.
module MiniEtl
  # Creates a new Process and passes it to the given block for configuration.
  #
  # @yieldparam process [MiniEtl::Process] the newly created process
  # @return [MiniEtl::Process, nil] the configured process, or nil when no
  #   block is given
  def self.create_process(&block)
    return nil unless block_given?

    Process.new.tap(&block)
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module MiniEtl
  # Generates record structures from sourced data, using the strategy that
  # matches the source type.
  class Generator
    # Defined before `include Status`: the inclusion hook reads this constant
    # to build the `<state>?` / `<state>!` helpers.
    VALID_STATES = {
      initialized: 0,
      bootstrapped: 1,
      transformed: 2,
      failed: 3
    }.freeze

    include Status

    attr_accessor :lazy, :data
    attr_writer :type
    attr_reader :payload

    # TODO: This needs to know the type of the receiver
    def initialize
      @lazy = false
      @payload = []
      initialized!
    end

    # Stores the type and the raw data to be transformed.
    #
    # @param type [Object] source type, used to look up the strategy
    # @param data [Object] raw data later handed to the strategy's `generate`
    # @raise [ArgumentError] when either argument is nil
    def bootstrap(type, data)
      raise ArgumentError, 'type and data are required' if type.nil? || data.nil?

      @type = type
      @data = data
      bootstrapped!
    end

    # Runs the strategy's `generate` over the bootstrapped data and stores the
    # result in `payload`.
    #
    # Marks the generator as failed and returns nil when it has not been
    # bootstrapped, or when no strategy exists for the type (previously the
    # latter raised NoMethodError on nil).
    def transform
      strategy = bootstrapped? ? MiniEtl::Strategy.for(@type) : nil
      if strategy.nil?
        failed!
        return nil
      end

      @payload = strategy.generate(@data)
      transformed!
    end

    # TODO: This needs to feed a block with source data, to stream a structure
    # generation instead of doing it in place
    # def start
    #   yield @structure.next if @lazy && block_given?
    # end
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module MiniEtl
  # ETL Process wrapper: owns a Source and a Generator and drives them
  # through the bootstrap and generate steps.
  # Should this go all the way?
  # Probably
  class Process
    # Defined before `include Status`: the inclusion hook reads this constant
    # to build the `<state>?` / `<state>!` helpers.
    VALID_STATES = {
      initialized: 0,
      bootstrapped: 1,
      generated: 2,
      finished: 3,
      failed: 4
    }.freeze

    include Status
    attr_reader :source, :generator

    def initialize
      @source = Source.new
      @generator = Generator.new

      initialized!
    end

    # Validates and fetches the source, then hands its type and payload to
    # the generator. Marks the process as failed when it is not freshly
    # initialized or the source does not validate.
    def bootstrap
      return failed! unless initialized? && @source.validate

      @source.fetch
      @generator.bootstrap(@source.type, @source.payload)
      bootstrapped!
    end

    # Transforms the sourced data through the generator.
    #
    # TODO: Parse it, bear in mind this will be in memory and may need to be split
    # TODO: Transform it into useful bits
    #       This may be done in rails. Provide a useful interface in that case
    # TODO: Load the thing wherever it needs to go
    def generate
      return failed! unless bootstrapped?

      @generator.transform
      # Only report success when the generator actually reached its
      # transformed state; previously a failed transform was still marked
      # as generated.
      @generator.transformed? ? generated! : failed!
    end
  end
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module MiniEtl
  # Sources data of a given type from a given location
  class Source
    # Defined before `include Status`: the inclusion hook reads this constant
    # to build the `<state>?` / `<state>!` helpers.
    VALID_STATES = {
      initialized: 0,
      validated: 1,
      sourced: 2,
      failed: 3
    }.freeze

    include Status

    attr_accessor :type, :location, :data
    attr_reader :payload
    ACCEPTED_PARAMS = %i[type location data].freeze

    # @param params [Hash] optional values for the keys in ACCEPTED_PARAMS;
    #   other keys are ignored and missing ones are set to nil
    def initialize(params = {})
      ACCEPTED_PARAMS.each do |param|
        instance_variable_set("@#{param}", params[param])
      end
      @payload = []
      initialized!
    end

    # Asks the strategy for this source's type whether the source is usable.
    #
    # @return [Object] false when no strategy exists for the type, otherwise
    #   whatever the strategy's `validate` returns
    def validate
      strategy = MiniEtl::Strategy.for(@type)

      # The previous `return false && failed!` short-circuited on `false`, so
      # the source was never marked as failed.
      if strategy.nil?
        failed!
        return false
      end

      strategy.validate(self).tap { |valid| valid ? validated! : failed! }
    end

    # Loads the payload through the strategy. Only runs after a successful
    # `validate`; otherwise the source is marked as failed.
    def fetch
      strategy = MiniEtl::Strategy.for(@type)

      if strategy && validated?
        @payload = strategy.fetch(self)
        sourced!
      else
        failed!
      end
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
3
|
+
module MiniEtl
  module Strategies
    # Extract/Transform strategy for a CSV input
    class CSVStrategy
      class << self
        # Checks that the source points at an existing path.
        #
        # @param source [#location] object exposing the path to the CSV file
        # @return [Boolean] false when the location is nil or does not exist
        #   (a nil location previously raised TypeError)
        def validate(source)
          location = source.location
          return false if location.nil?

          # File.exist? avoids relying on Pathname, which this file never
          # requires.
          File.exist?(location)
        end

        # Reads the whole file at the source's location.
        #
        # @param source [#location] object exposing the path to the CSV file
        # @return [String] raw file contents
        def fetch(source)
          File.read(source.location)
        end

        # Parses raw CSV text.
        #
        # @param data [String] CSV text
        # @return [Array<Array>] one array of fields per row
        def generate(data)
          CSV.parse(data)
        end
      end
    end
  end
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module MiniEtl
  # Looks up the strategy class that handles a given type of data
  # extraction/transformation
  class Strategy
    # Resolves `<TYPE>Strategy` (type upcased) inside MiniEtl::Strategies.
    #
    # @param type [#to_s] source type, e.g. :csv
    # @return [Class, nil] the matching strategy class, or nil when no such
    #   constant is defined
    def self.for(type)
      constant_name = "#{type.to_s.upcase}Strategy"
      return nil unless Strategies.const_defined?(constant_name)

      Strategies.const_get(constant_name)
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module MiniEtl
  # Tracks a status on the including class.
  #
  # For every state the including class gains a `<state>?` predicate and a
  # `<state>!` setter, plus a `status` reader.
  module Status
    # States used when the including class defines no VALID_STATES constant.
    DEFAULT_STATES = {
      initialized: 0,
      finished: 1,
      failed: 2
    }.freeze

    # Inclusion hook: builds the state helpers for `base`.
    #
    # `base` must define VALID_STATES *before* including this module for its
    # own states to be used; otherwise DEFAULT_STATES applies.
    #
    # The helpers are defined on `base` itself. They used to be defined on
    # this shared module, so each including class overwrote the state values
    # of the ones that included it earlier.
    #
    # @param base [Module] the class or module including Status
    def self.included(base)
      states = base.const_defined?(:VALID_STATES) ? base.const_get(:VALID_STATES) : DEFAULT_STATES

      base.class_eval do
        attr_reader :status

        states.each do |verb, value|
          define_method("#{verb}?") { @status == value }
          define_method("#{verb}!") { @status = value }
        end
      end
    end
  end
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Support
  # Test files generation: writes fake sample data under `samples/`.
  module Generation
    # COLUMNS = %w[name last_name nationality origin phone bank iban currency segment].freeze

    # Number of records written for each sample size.
    RECORD_SIZE = {
      small: 8_500,
      medium: 85_000,
      large: 825_000
    }.freeze

    # Writes `samples/<size>.csv` with one comma-separated record per line.
    # Records used to be written back to back with no line separator.
    #
    # @param size [Symbol] a key of RECORD_SIZE
    # @raise [KeyError] when size is not a key of RECORD_SIZE
    def generate_csv(size)
      record_count = RECORD_SIZE.fetch(size)
      check_dir

      # The block form of File.open closes the file; no explicit close needed.
      File.open("samples/#{size}.csv", 'w') do |sample_file|
        record_count.times { sample_file.puts(dummy_data.join(',')) }
      end
    end

    # Writes `samples/<size>.json` as a single JSON array of record objects.
    #
    # @param size [Symbol] a key of RECORD_SIZE
    # @raise [KeyError] when size is not a key of RECORD_SIZE
    def generate_json(size)
      record_count = RECORD_SIZE.fetch(size)
      check_dir

      File.open("samples/#{size}.json", 'w') do |sample_file|
        sample_file.write('[')
        # All records but the last are followed by a comma.
        record_count.pred.times { sample_file.write(json_string + ',') }
        sample_file.write(json_string + ']')
      end
    end

    private

    # Creates the `samples` directory when it does not exist yet.
    def check_dir
      Dir.mkdir('samples') unless Dir.exist?('samples')
    end

    # Field names, in the same order as the values built by `dummy_data`.
    def dummy_names
      %i[name last_name nationality capital_city phone_number bank iban currency industry]
    end

    # Builds one fake record as an array of values.
    def dummy_data
      [
        Faker::Name.first_name,
        Faker::Name.last_name,
        Faker::Nation.nationality,
        Faker::Nation.capital_city,
        Faker::PhoneNumber.phone_number_with_country_code,
        Faker::Bank.name,
        Faker::Bank.iban,
        Faker::Currency.code,
        Faker::IndustrySegments.industry
      ]
    rescue Faker::UniqueGenerator::RetryLimitExceeded
      # Reset the unique generators and build the record again, so callers
      # always receive an array (the rescue used to return the result of
      # `clear` instead of a record).
      Faker::UniqueGenerator.clear
      retry
    end

    # Serializes one fake record as a JSON object keyed by `dummy_names`.
    def json_string
      JSON.dump(Hash[dummy_names.zip(dummy_data)])
    end
  end
end
|
data/mini_etl.gemspec
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
lib = File.expand_path('lib', __dir__)
|
4
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
5
|
+
require 'mini_etl/version'
|
6
|
+
|
7
|
+
Gem::Specification.new do |spec|
  # Identity
  spec.name     = 'mini_etl'
  spec.version  = MiniEtl::VERSION
  spec.authors  = ['Gerardo Galindez']
  spec.email    = ['ggalindezb@gmail.com']

  spec.summary  = 'Extract/Transform/Load wrapper'
  spec.homepage = 'https://github.com/ggalindezb/mini_etl'

  # Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
  # to allow pushing to a single host or delete this section to allow pushing to any host.
  # if spec.respond_to?(:metadata)
  #   spec.metadata["allowed_push_host"] = "TODO: Set to 'http://mygemserver.com'"
  #
  #   spec.metadata["homepage_uri"] = spec.homepage
  #   spec.metadata["source_code_uri"] = "TODO: Put your gem's public repo URL here."
  #   spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
  # else
  #   raise "RubyGems 2.0 or newer is required to protect against " \
  #     "public gem pushes."
  # end

  # Packaged files: everything tracked by git (listed with `git ls-files -z`
  # from the gemspec's directory) except paths under test/, spec/ or features/.
  excluded_paths = %r{^(test|spec|features)/}
  spec.files = Dir.chdir(File.expand_path(__dir__)) do
    tracked = `git ls-files -z`.split("\x0")
    tracked.reject { |path| path.match(excluded_paths) }
  end

  # Executables are the base names of packaged files under exe/.
  spec.bindir        = 'exe'
  spec.executables   = spec.files.grep(%r{^exe/}).map { |path| File.basename(path) }
  spec.require_paths = ['lib']

  # Development-only dependencies.
  {
    'bundler' => '~> 1.16',
    'faker' => '~> 1.9',
    'pry' => '~> 0.12.2',
    'rake' => '~> 10.0',
    'rspec' => '~> 3.0',
    'rubocop' => '~> 0.71.0',
    'simplecov' => '~> 0.16.1'
  }.each do |gem_name, requirement|
    spec.add_development_dependency gem_name, requirement
  end
end
|
metadata
ADDED
@@ -0,0 +1,160 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: mini_etl
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.2.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Gerardo Galindez
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2019-06-07 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.16'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.16'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: faker
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '1.9'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '1.9'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: pry
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 0.12.2
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 0.12.2
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rake
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '10.0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '10.0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rspec
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '3.0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '3.0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: rubocop
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 0.71.0
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: 0.71.0
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: simplecov
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - "~>"
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: 0.16.1
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - "~>"
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: 0.16.1
|
111
|
+
description:
|
112
|
+
email:
|
113
|
+
- ggalindezb@gmail.com
|
114
|
+
executables: []
|
115
|
+
extensions: []
|
116
|
+
extra_rdoc_files: []
|
117
|
+
files:
|
118
|
+
- ".gitignore"
|
119
|
+
- ".rubocop.yml"
|
120
|
+
- ".rubocop_todo.yml"
|
121
|
+
- CHANGELOG.md
|
122
|
+
- Gemfile
|
123
|
+
- Gemfile.lock
|
124
|
+
- README.md
|
125
|
+
- Rakefile
|
126
|
+
- bin/console
|
127
|
+
- bin/setup
|
128
|
+
- lib/mini_etl.rb
|
129
|
+
- lib/mini_etl/generator.rb
|
130
|
+
- lib/mini_etl/process.rb
|
131
|
+
- lib/mini_etl/source.rb
|
132
|
+
- lib/mini_etl/strategies/csv_strategy.rb
|
133
|
+
- lib/mini_etl/strategy.rb
|
134
|
+
- lib/mini_etl/util/status.rb
|
135
|
+
- lib/mini_etl/version.rb
|
136
|
+
- lib/tasks/support/generation.rb
|
137
|
+
- mini_etl.gemspec
|
138
|
+
homepage: https://github.com/ggalindezb/mini_etl
|
139
|
+
licenses: []
|
140
|
+
metadata: {}
|
141
|
+
post_install_message:
|
142
|
+
rdoc_options: []
|
143
|
+
require_paths:
|
144
|
+
- lib
|
145
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
146
|
+
requirements:
|
147
|
+
- - ">="
|
148
|
+
- !ruby/object:Gem::Version
|
149
|
+
version: '0'
|
150
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
151
|
+
requirements:
|
152
|
+
- - ">="
|
153
|
+
- !ruby/object:Gem::Version
|
154
|
+
version: '0'
|
155
|
+
requirements: []
|
156
|
+
rubygems_version: 3.0.3
|
157
|
+
signing_key:
|
158
|
+
specification_version: 4
|
159
|
+
summary: Extract/Transform/Load wrapper
|
160
|
+
test_files: []
|