mini_etl 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +18 -0
- data/.rubocop.yml +98 -0
- data/.rubocop_todo.yml +14 -0
- data/CHANGELOG.md +18 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +74 -0
- data/README.md +145 -0
- data/Rakefile +72 -0
- data/bin/console +15 -0
- data/bin/setup +8 -0
- data/lib/mini_etl.rb +27 -0
- data/lib/mini_etl/generator.rb +48 -0
- data/lib/mini_etl/process.rb +49 -0
- data/lib/mini_etl/source.rb +45 -0
- data/lib/mini_etl/strategies/csv_strategy.rb +22 -0
- data/lib/mini_etl/strategy.rb +14 -0
- data/lib/mini_etl/util/status.rb +27 -0
- data/lib/mini_etl/version.rb +5 -0
- data/lib/tasks/support/generation.rb +54 -0
- data/mini_etl.gemspec +45 -0
- metadata +160 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: bf6d9d9f6c520143a0ed4798dae23ac0dd17d6acd338e4c954a650433c111fc5
|
4
|
+
data.tar.gz: 3b2cc9482751be5af528bfa9046eb55dd53a8d8a19eb368f80f775dc6747c72b
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: acecce229c1db06eef8cabf6e2278bcfd4b0da83976e2e30101c6a3a1d0908cc002932ce292ff7489e430aaff95adf6677426072ebe03151194b696efe3e4286
|
7
|
+
data.tar.gz: 159a16e1c228a3b19485534a065b8c84a44bf93e54e1c19316723166d39550f0e3fdfe4a88184e8114854d167e83fa6d225783c97aad10d54e4762f5f894bdaa
|
data/.gitignore
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# Ignore generated docs
|
2
|
+
/.yardoc
|
3
|
+
/_yardoc
|
4
|
+
/doc
|
5
|
+
README.html
|
6
|
+
|
7
|
+
# Ignore generated files
|
8
|
+
/.bundle
|
9
|
+
/pkg
|
10
|
+
/samples
|
11
|
+
/tmp
|
12
|
+
|
13
|
+
# Ignore test generated files
|
14
|
+
.rspec
|
15
|
+
.rspec_status
|
16
|
+
/coverage
|
17
|
+
/spec/reports
|
18
|
+
spec/examples.txt
|
data/.rubocop.yml
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
inherit_from: .rubocop_todo.yml
|
2
|
+
|
3
|
+
AllCops:
|
4
|
+
RubyInterpreters:
|
5
|
+
- ruby
|
6
|
+
# Include common Ruby source files.
|
7
|
+
Include:
|
8
|
+
- '**/*.rb'
|
9
|
+
- '**/*.gemfile'
|
10
|
+
- '**/*.gemspec'
|
11
|
+
- '**/*.jb'
|
12
|
+
- '**/*.jbuilder'
|
13
|
+
- '**/*.rake'
|
14
|
+
- '**/*.spec'
|
15
|
+
Exclude:
|
16
|
+
- 'node_modules/**/*'
|
17
|
+
- 'vendor/**/*'
|
18
|
+
- '.git/**/*'
|
19
|
+
# Default formatter will be used if no `-f/--format` option is given.
|
20
|
+
DefaultFormatter: progress
|
21
|
+
# Cop names are displayed in offense messages by default. Change behavior
|
22
|
+
# by overriding DisplayCopNames, or by giving the `--no-display-cop-names`
|
23
|
+
# option.
|
24
|
+
DisplayCopNames: true
|
25
|
+
# Style guide URLs are not displayed in offense messages by default. Change
|
26
|
+
# behavior by overriding `DisplayStyleGuide`, or by giving the
|
27
|
+
# `-S/--display-style-guide` option.
|
28
|
+
DisplayStyleGuide: false
|
29
|
+
# When specifying style guide URLs, any paths and/or fragments will be
|
30
|
+
# evaluated relative to the base URL.
|
31
|
+
StyleGuideBaseURL: https://github.com/rubocop-hq/ruby-style-guide
|
32
|
+
# Extra details are not displayed in offense messages by default. Change
|
33
|
+
# behavior by overriding ExtraDetails, or by giving the
|
34
|
+
# `-E/--extra-details` option.
|
35
|
+
ExtraDetails: false
|
36
|
+
# Additional cops that do not reference a style guide rule may be enabled by
|
37
|
+
# default. Change behavior by overriding `StyleGuideCopsOnly`, or by giving
|
38
|
+
# the `--only-guide-cops` option.
|
39
|
+
StyleGuideCopsOnly: false
|
40
|
+
# All cops except the ones configured `Enabled: false` in this file are enabled by default.
|
41
|
+
# Change this behavior by overriding either `DisabledByDefault` or `EnabledByDefault`.
|
42
|
+
# When `DisabledByDefault` is `true`, all cops in the default configuration
|
43
|
+
# are disabled, and only cops in user configuration are enabled. This makes
|
44
|
+
# cops opt-in instead of opt-out. Note that when `DisabledByDefault` is `true`,
|
45
|
+
# cops in user configuration will be enabled even if they don't set the
|
46
|
+
# Enabled parameter.
|
47
|
+
# When `EnabledByDefault` is `true`, all cops, even those configured `Enabled: false`
|
48
|
+
# in this file are enabled by default. Cops can still be disabled in user configuration.
|
49
|
+
# Note that it is invalid to set both EnabledByDefault and DisabledByDefault
|
50
|
+
# to true in the same configuration.
|
51
|
+
EnabledByDefault: false
|
52
|
+
DisabledByDefault: false
|
53
|
+
# Enables the result cache if `true`. Can be overridden by the `--cache` command
|
54
|
+
# line option.
|
55
|
+
UseCache: true
|
56
|
+
# Threshold for how many files can be stored in the result cache before some
|
57
|
+
# of the files are automatically removed.
|
58
|
+
MaxFilesInCache: 20000
|
59
|
+
# The cache will be stored in "rubocop_cache" under this directory. If
|
60
|
+
# CacheRootDirectory is ~ (nil), which it is by default, the root will be
|
61
|
+
# taken from the environment variable `$XDG_CACHE_HOME` if it is set, or if
|
62
|
+
# `$XDG_CACHE_HOME` is not set, it will be `$HOME/.cache/`.
|
63
|
+
CacheRootDirectory: ~
|
64
|
+
# It is possible for a malicious user to know the location of RuboCop's cache
|
65
|
+
# directory by looking at CacheRootDirectory, and create a symlink in its
|
66
|
+
# place that could cause RuboCop to overwrite unintended files, or read
|
67
|
+
# malicious input. If you are certain that your cache location is secure from
|
68
|
+
# this kind of attack, and wish to use a symlinked cache location, set this
|
69
|
+
# value to "true".
|
70
|
+
AllowSymlinksInCacheRootDirectory: false
|
71
|
+
# What MRI version of the Ruby interpreter is the inspected code intended to
|
72
|
+
# run on? (If there is more than one, set this to the lowest version.)
|
73
|
+
# If a value is specified for TargetRubyVersion then it is used. Acceptable
|
74
|
+
# values are specified as a float (e.g. 2.5); the teeny version of Ruby
|
75
|
+
# should not be included. If the project specifies a Ruby version in the
|
76
|
+
# .ruby-version file, Gemfile or gems.rb file, RuboCop will try to determine
|
77
|
+
# the desired version of Ruby by inspecting the .ruby-version file first,
|
78
|
+
# followed by the Gemfile.lock or gems.locked file. (Although the Ruby version
|
79
|
+
# is specified in the Gemfile or gems.rb file, RuboCop reads the final value
|
80
|
+
# from the lock file.) If the Ruby version is still unresolved, RuboCop will
|
81
|
+
# use the oldest officially supported Ruby version (currently Ruby 2.3).
|
82
|
+
TargetRubyVersion: ~
|
83
|
+
# What version of Rails is the inspected code using? If a value is specified
|
84
|
+
# for TargetRailsVersion then it is used. Acceptable values are specified
|
85
|
+
# as a float (i.e. 5.1); the patch version of Rails should not be included.
|
86
|
+
# If TargetRailsVersion is not set, RuboCop will parse the Gemfile.lock or
|
87
|
+
# gems.locked file to find the version of Rails that has been bound to the
|
88
|
+
# application. If neither of those files exist, RuboCop will use Rails 5.0
|
89
|
+
# as the default.
|
90
|
+
TargetRailsVersion: ~
|
91
|
+
|
92
|
+
Style/FrozenStringLiteralComment:
|
93
|
+
Enabled: false
|
94
|
+
Metrics/LineLength:
|
95
|
+
Max: 150
|
96
|
+
Metrics/BlockLength:
|
97
|
+
Exclude:
|
98
|
+
- 'spec/**/*'
|
data/.rubocop_todo.yml
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
# This configuration was generated by
|
2
|
+
# `rubocop --auto-gen-config`
|
3
|
+
# on 2019-06-04 10:13:24 -0500 using RuboCop version 0.71.0.
|
4
|
+
# The point is for the user to remove these configuration records
|
5
|
+
# one by one as the offenses are removed from the code base.
|
6
|
+
# Note that changes in the inspected code, or installation of new
|
7
|
+
# versions of RuboCop, may require this file to be generated again.
|
8
|
+
|
9
|
+
# Offense count: 1
|
10
|
+
Style/Documentation:
|
11
|
+
Exclude:
|
12
|
+
- 'spec/**/*'
|
13
|
+
- 'test/**/*'
|
14
|
+
- 'lib/etl.rb'
|
data/CHANGELOG.md
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
# Changelog
|
2
|
+
All notable changes to this project will be documented in this file.
|
3
|
+
|
4
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
5
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
6
|
+
|
7
|
+
## [Unreleased]
|
8
|
+
|
9
|
+
## [0.1.0] - 2019-06-06
|
10
|
+
|
11
|
+
### Added
|
12
|
+
- [Task] Improve spec coverage
|
13
|
+
- [Base] Introduce Generator for structure generation
|
14
|
+
- [Base] Factor out strategies
|
15
|
+
- [Base] General extraction strategy, still static
|
16
|
+
- [Base] Basic, more useful documentation
|
17
|
+
- [Base] CSV Sample strategy: File generation and sourcing
|
18
|
+
- [Base] Initial application
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
mini_etl (0.2.0)
|
5
|
+
|
6
|
+
GEM
|
7
|
+
remote: https://rubygems.org/
|
8
|
+
specs:
|
9
|
+
ast (2.4.0)
|
10
|
+
coderay (1.1.2)
|
11
|
+
concurrent-ruby (1.1.5)
|
12
|
+
diff-lcs (1.3)
|
13
|
+
docile (1.3.1)
|
14
|
+
faker (1.9.3)
|
15
|
+
i18n (>= 0.7)
|
16
|
+
i18n (1.6.0)
|
17
|
+
concurrent-ruby (~> 1.0)
|
18
|
+
jaro_winkler (1.5.2)
|
19
|
+
json (2.2.0)
|
20
|
+
method_source (0.9.2)
|
21
|
+
parallel (1.17.0)
|
22
|
+
parser (2.6.3.0)
|
23
|
+
ast (~> 2.4.0)
|
24
|
+
pry (0.12.2)
|
25
|
+
coderay (~> 1.1.0)
|
26
|
+
method_source (~> 0.9.0)
|
27
|
+
rainbow (3.0.0)
|
28
|
+
rake (10.5.0)
|
29
|
+
rspec (3.8.0)
|
30
|
+
rspec-core (~> 3.8.0)
|
31
|
+
rspec-expectations (~> 3.8.0)
|
32
|
+
rspec-mocks (~> 3.8.0)
|
33
|
+
rspec-core (3.8.0)
|
34
|
+
rspec-support (~> 3.8.0)
|
35
|
+
rspec-expectations (3.8.3)
|
36
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
37
|
+
rspec-support (~> 3.8.0)
|
38
|
+
rspec-mocks (3.8.0)
|
39
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
40
|
+
rspec-support (~> 3.8.0)
|
41
|
+
rspec-support (3.8.0)
|
42
|
+
rubocop (0.71.0)
|
43
|
+
jaro_winkler (~> 1.5.1)
|
44
|
+
parallel (~> 1.10)
|
45
|
+
parser (>= 2.6)
|
46
|
+
rainbow (>= 2.2.2, < 4.0)
|
47
|
+
ruby-progressbar (~> 1.7)
|
48
|
+
unicode-display_width (>= 1.4.0, < 1.7)
|
49
|
+
ruby-progressbar (1.10.1)
|
50
|
+
simplecov (0.16.1)
|
51
|
+
docile (~> 1.1)
|
52
|
+
json (>= 1.8, < 3)
|
53
|
+
simplecov-html (~> 0.10.0)
|
54
|
+
simplecov-html (0.10.2)
|
55
|
+
unicode-display_width (1.6.0)
|
56
|
+
|
57
|
+
PLATFORMS
|
58
|
+
ruby
|
59
|
+
|
60
|
+
DEPENDENCIES
|
61
|
+
bundler (~> 1.16)
|
62
|
+
faker (~> 1.9)
|
63
|
+
mini_etl!
|
64
|
+
pry (~> 0.12.2)
|
65
|
+
rake (~> 10.0)
|
66
|
+
rspec (~> 3.0)
|
67
|
+
rubocop (~> 0.71.0)
|
68
|
+
simplecov (~> 0.16.1)
|
69
|
+
|
70
|
+
RUBY VERSION
|
71
|
+
ruby 2.6.3p62
|
72
|
+
|
73
|
+
BUNDLED WITH
|
74
|
+
1.17.2
|
data/README.md
ADDED
@@ -0,0 +1,145 @@
|
|
1
|
+
# Mini ETL
|
2
|
+
|
3
|
+
Basic toolkit for Extract/Transform/Load operations. Abstracts the details of
|
4
|
+
performing sourcing, intermediate structure generation and data persistence.
|
5
|
+
|
6
|
+
## Usage
|
7
|
+
|
8
|
+
### Sourcing
|
9
|
+
|
10
|
+
A `MiniEtl` `Process` is kicked off by configuring a process. For a basic CSV file
|
11
|
+
deserialize and bulk load:
|
12
|
+
|
13
|
+
```
|
14
|
+
process = MiniEtl.create_process do |process|
|
15
|
+
process.source.type = :csv
|
16
|
+
process.source.location = 'samples/small.csv'
|
17
|
+
end
|
18
|
+
|
19
|
+
process.bootstrap
|
20
|
+
```
|
21
|
+
|
22
|
+
TODO: Write a strategy for HTTP, use JSON server
|
23
|
+
|
24
|
+
```
|
25
|
+
process = MiniEtl.create_process do |process|
|
26
|
+
process.source.type = :http
|
27
|
+
process.source.location = 'localhost:8080/sample'
|
28
|
+
end
|
29
|
+
|
30
|
+
process.bootstrap
|
31
|
+
```
|
32
|
+
|
33
|
+
A strategy is currently available for CSV (JSON over HTTP is still planned). If you need something else entirely,
|
34
|
+
a manual source can be used instead:
|
35
|
+
|
36
|
+
```
|
37
|
+
process = MiniEtl.create_process do |process|
|
38
|
+
process.source.type = :manual
|
39
|
+
process.source.method = Proc.new do
|
40
|
+
...
|
41
|
+
end
|
42
|
+
end
|
43
|
+
```
|
44
|
+
|
45
|
+
### Structure generation
|
46
|
+
|
47
|
+
Once data sourcing is complete, data can be fetched in-place.
|
48
|
+
|
49
|
+
```
|
50
|
+
process = MiniEtl.create_process do |process|
|
51
|
+
process.source.type = :csv
|
52
|
+
process.source.location = 'samples/small.csv'
|
53
|
+
end
|
54
|
+
|
55
|
+
process.bootstrap
|
56
|
+
process.generate
|
57
|
+
|
58
|
+
process.generator.structures # intermediate structure for bulk import
|
59
|
+
```
|
60
|
+
|
61
|
+
If the data source is too large to process in memory, an iterator can be given
|
62
|
+
instead:
|
63
|
+
|
64
|
+
```
|
65
|
+
process = MiniEtl.create_process do |process|
|
66
|
+
process.source.type = :csv
|
67
|
+
process.source.location = 'samples/large.csv'
|
68
|
+
process.generator.lazy = true
|
69
|
+
end
|
70
|
+
|
71
|
+
process.bootstrap
|
72
|
+
process.generator.start do |structures|
|
73
|
+
...
|
74
|
+
end
|
75
|
+
```
|
76
|
+
|
77
|
+
### Data persistence
|
78
|
+
|
79
|
+
Finally, once the data is shaped the way you need it, it can be persisted in
|
80
|
+
any way you need. The receiver class is expected to respond to
|
81
|
+
`.create(args)`
|
82
|
+
|
83
|
+
```
|
84
|
+
process = MiniEtl.create_process do |process|
|
85
|
+
process.source.type = :csv
|
86
|
+
process.source.location = 'samples/large.csv'
|
87
|
+
process.store.type = Person # An active record model
|
88
|
+
end
|
89
|
+
|
90
|
+
process.bootstrap
|
91
|
+
process.generate
|
92
|
+
process.persist
|
93
|
+
```
|
94
|
+
|
95
|
+
In this way, any arbitrary store can be created,
|
96
|
+
|
97
|
+
```
|
98
|
+
class Payroll
|
99
|
+
Struct.new(:target, :name, :last_name, ...)
|
100
|
+
@@data = []
|
101
|
+
|
102
|
+
def create(params = {})
|
103
|
+
@@data << Struct::Target.new(name: params[:name], last_name: params[:last_name], ...)
|
104
|
+
end
|
105
|
+
end
|
106
|
+
|
107
|
+
process = MiniEtl.create_process do |process|
|
108
|
+
process.source.type = :csv
|
109
|
+
process.source.location = 'samples/small.csv'
|
110
|
+
process.store.type = Payroll
|
111
|
+
end
|
112
|
+
|
113
|
+
process.bootstrap
|
114
|
+
process.generate
|
115
|
+
process.persist
|
116
|
+
```
|
117
|
+
|
118
|
+
## Development
|
119
|
+
|
120
|
+
TODO: Test stuff
|
121
|
+
`$ rake`
|
122
|
+
|
123
|
+
Runs rspec, rubocop, generates coverage report
|
124
|
+
|
125
|
+
TODO: Explain how to generate csv files and the rest of the samples
|
126
|
+
NOTE: This will take ~5.5 mins, super slow, would need a parallel version
|
127
|
+
```
|
128
|
+
$ rake sample:csv:all
|
129
|
+
```
|
130
|
+
|
131
|
+
TODO: Explain how to use JSON Server to provide a fake API
|
132
|
+
|
133
|
+
```
|
134
|
+
$ npm install -g json-server
|
135
|
+
$ rake sample:json:small
|
136
|
+
$ json-server --watch samples/small.json --port 3001
|
137
|
+
```
|
138
|
+
|
139
|
+
API is now available at `localhost:3001/payroll`
|
140
|
+
|
141
|
+
...
|
142
|
+
|
143
|
+
## Contributing
|
144
|
+
|
145
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/ggalindezb/mini_etl.
|
data/Rakefile
ADDED
@@ -0,0 +1,72 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'bundler/gem_tasks'
|
4
|
+
require 'rspec/core/rake_task'
|
5
|
+
require_relative 'lib/tasks/support/generation'
|
6
|
+
|
7
|
+
require 'rubygems'
|
8
|
+
require 'bundler'
|
9
|
+
Bundler.require(:default, :development)
|
10
|
+
|
11
|
+
RSpec::Core::RakeTask.new(:spec)
|
12
|
+
|
13
|
+
task default: :spec_n_cop
|
14
|
+
|
15
|
+
task :spec_n_cop do
|
16
|
+
Rake::Task['spec'].invoke
|
17
|
+
require 'rubocop'
|
18
|
+
cli = RuboCop::CLI.new
|
19
|
+
cli.run
|
20
|
+
end
|
21
|
+
|
22
|
+
namespace :sample do
|
23
|
+
include Support::Generation
|
24
|
+
|
25
|
+
namespace :csv do
|
26
|
+
desc 'Generate CSV samples'
|
27
|
+
task :all do
|
28
|
+
generate_csv(:small)
|
29
|
+
generate_csv(:medium)
|
30
|
+
generate_csv(:large)
|
31
|
+
end
|
32
|
+
|
33
|
+
desc 'Generate a CSV sample, ~1 MB'
|
34
|
+
task :small do
|
35
|
+
generate_csv(:small)
|
36
|
+
end
|
37
|
+
|
38
|
+
desc 'Generate a CSV sample, ~10 MB'
|
39
|
+
task :medium do
|
40
|
+
generate_csv(:medium)
|
41
|
+
end
|
42
|
+
|
43
|
+
desc 'Generate a CSV sample, ~100 MB'
|
44
|
+
task :large do
|
45
|
+
generate_csv(:large)
|
46
|
+
end
|
47
|
+
end
|
48
|
+
|
49
|
+
namespace :json do
|
50
|
+
# Generates every JSON sample size in one go (small, medium and large).
desc 'Generate JSON samples'
task :all do
  generate_json(:small)
  generate_json(:medium)
  generate_json(:large)
end
|
56
|
+
|
57
|
+
desc 'Generate a JSON sample, ~2 MB'
|
58
|
+
task :small do
|
59
|
+
generate_json(:small)
|
60
|
+
end
|
61
|
+
|
62
|
+
desc 'Generate a JSON sample, ~20 MB'
|
63
|
+
task :medium do
|
64
|
+
generate_json(:medium)
|
65
|
+
end
|
66
|
+
|
67
|
+
desc 'Generate a JSON sample, ~200 MB'
|
68
|
+
task :large do
|
69
|
+
generate_json(:large)
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
data/bin/console
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
require 'bundler/setup'
|
5
|
+
require 'mini_etl'
|
6
|
+
|
7
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
8
|
+
# with your gem easier. You can also use a different console, if you like.
|
9
|
+
|
10
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
11
|
+
# require "pry"
|
12
|
+
# Pry.start
|
13
|
+
|
14
|
+
require 'irb'
|
15
|
+
IRB.start(__FILE__)
|
data/bin/setup
ADDED
data/lib/mini_etl.rb
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'mini_etl/version'
|
4
|
+
|
5
|
+
require 'rubygems'
|
6
|
+
require 'bundler'
|
7
|
+
Bundler.require(:default)
|
8
|
+
|
9
|
+
require 'mini_etl/util/status'
|
10
|
+
require 'mini_etl/strategy'
|
11
|
+
require 'mini_etl/strategies/csv_strategy'
|
12
|
+
require 'mini_etl/source'
|
13
|
+
require 'mini_etl/generator'
|
14
|
+
require 'mini_etl/process'
|
15
|
+
|
16
|
+
# Place exception here
|
17
|
+
module MiniEtl
|
18
|
+
class << self
|
19
|
+
def create_process(&block)
|
20
|
+
return nil unless block_given?
|
21
|
+
|
22
|
+
process = Process.new
|
23
|
+
block.call(process)
|
24
|
+
process
|
25
|
+
end
|
26
|
+
end
|
27
|
+
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module MiniEtl
|
4
|
+
# Generate record information from a source
|
5
|
+
class Generator
|
6
|
+
VALID_STATES = {
|
7
|
+
initialized: 0,
|
8
|
+
bootstrapped: 1,
|
9
|
+
transformed: 2,
|
10
|
+
failed: 3
|
11
|
+
}.freeze
|
12
|
+
|
13
|
+
include Status
|
14
|
+
|
15
|
+
attr_accessor :lazy, :data
|
16
|
+
attr_writer :type
|
17
|
+
attr_reader :payload
|
18
|
+
|
19
|
+
# TODO: This needs to know the type of the receiver
|
20
|
+
def initialize
|
21
|
+
@lazy = false
|
22
|
+
@payload = []
|
23
|
+
initialized!
|
24
|
+
end
|
25
|
+
|
26
|
+
def bootstrap(type, data)
|
27
|
+
raise ArgumentError if type.nil? || data.nil?
|
28
|
+
|
29
|
+
@type = type
|
30
|
+
@data = data
|
31
|
+
bootstrapped!
|
32
|
+
end
|
33
|
+
|
34
|
+
# Turns the bootstrapped data into the payload using the strategy that
# matches @type.
#
# The generator is marked as failed, and the payload left untouched, when it
# has not been bootstrapped or when no strategy exists for @type.
#
# @return [Object, nil] the value returned by +transformed!+ on success,
#   nil on failure
def transform
  # Only look the strategy up once we know the generator is ready for it.
  strategy = MiniEtl::Strategy.for(@type) if bootstrapped?

  unless strategy
    failed!
    return
  end

  @payload = strategy.generate(@data)
  transformed!
end
|
41
|
+
|
42
|
+
# TODO: This needs to feed a block with source data, to stream a structure
|
43
|
+
# generation instead of doing it in place
|
44
|
+
# def start
|
45
|
+
# yield @structure.next if @lazy && block_given?
|
46
|
+
# end
|
47
|
+
end
|
48
|
+
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module MiniEtl
|
4
|
+
# ETL Process wrapper
|
5
|
+
# Should this go all the way?
|
6
|
+
# Probably
|
7
|
+
class Process
|
8
|
+
VALID_STATES = {
|
9
|
+
initialized: 0,
|
10
|
+
bootstrapped: 1,
|
11
|
+
generated: 2,
|
12
|
+
finished: 3,
|
13
|
+
failed: 4
|
14
|
+
}.freeze
|
15
|
+
|
16
|
+
include Status
|
17
|
+
attr_reader :source, :generator
|
18
|
+
|
19
|
+
def initialize
|
20
|
+
@source = Source.new
|
21
|
+
@generator = Generator.new
|
22
|
+
|
23
|
+
initialized!
|
24
|
+
end
|
25
|
+
|
26
|
+
def bootstrap
|
27
|
+
if initialized? && @source.validate
|
28
|
+
@source.fetch
|
29
|
+
@generator.bootstrap(@source.type, @source.payload)
|
30
|
+
bootstrapped!
|
31
|
+
else
|
32
|
+
failed!
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
# Asks the generator to transform the sourced data and records the outcome
# on the process.
#
# The process only becomes +generated+ when the generator reports that it
# reached its +transformed+ state; a process that was not bootstrapped, or a
# generator that failed, marks the process as failed.
#
# @return [Object] the value returned by +generated!+ or +failed!+
def generate
  # TODO: Parse it, bear in mind this will be in memory and may need to be split
  # TODO: Transform it into useful bits
  # This may be done in rails. Provide a useful interface in that case
  # TODO: Load the thing wherever it needs to go
  if bootstrapped?
    @generator.transform
    # Do not report success when the generator itself failed.
    @generator.transformed? ? generated! : failed!
  else
    failed!
  end
end
|
48
|
+
end
|
49
|
+
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module MiniEtl
|
4
|
+
# Source data from a given type and location
|
5
|
+
class Source
|
6
|
+
VALID_STATES = {
|
7
|
+
initialized: 0,
|
8
|
+
validated: 1,
|
9
|
+
sourced: 2,
|
10
|
+
failed: 3
|
11
|
+
}.freeze
|
12
|
+
|
13
|
+
include Status
|
14
|
+
|
15
|
+
attr_accessor :type, :location, :data
|
16
|
+
attr_reader :payload
|
17
|
+
ACCEPTED_PARAMS = %i[type location data].freeze
|
18
|
+
|
19
|
+
def initialize(params = {})
|
20
|
+
ACCEPTED_PARAMS.each do |param|
|
21
|
+
instance_variable_set "@#{param}".to_sym, params[param]
|
22
|
+
end
|
23
|
+
@payload = []
|
24
|
+
initialized!
|
25
|
+
end
|
26
|
+
|
27
|
+
# Validates the source with the strategy that matches @type and records the
# outcome in the status.
#
# @return [Boolean, Object] false when no strategy exists for @type,
#   otherwise whatever the strategy's +validate+ returned
def validate
  strategy = MiniEtl::Strategy.for(@type)

  if strategy.nil?
    # `false && failed!` short-circuits and never calls failed!, so the
    # failure is recorded explicitly before returning.
    failed!
    return false
  end

  strategy.validate(self).tap { |valid| valid ? validated! : failed! }
end
|
33
|
+
|
34
|
+
def fetch
|
35
|
+
strategy = MiniEtl::Strategy.for(@type)
|
36
|
+
|
37
|
+
if strategy && validated?
|
38
|
+
@payload = strategy.fetch(self)
|
39
|
+
sourced!
|
40
|
+
else
|
41
|
+
failed!
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'csv'
|
2
|
+
|
3
|
+
module MiniEtl
|
4
|
+
module Strategies
|
5
|
+
# Extract/Transform strategy for a CSV input
|
6
|
+
class CSVStrategy
|
7
|
+
class << self
|
8
|
+
# Checks that the source location can be read as a file.
#
# An unset location is reported as invalid instead of raising, and a
# directory is rejected because +fetch+ reads the location with File.read.
#
# @param source [#location] object exposing the path of the CSV input
# @return [Boolean] true when the location is set, exists and is not a
#   directory
def validate(source)
  location = source.location
  return false if location.nil?

  File.exist?(location) && !File.directory?(location)
end
|
11
|
+
|
12
|
+
def fetch(source)
|
13
|
+
File.read(source.location)
|
14
|
+
end
|
15
|
+
|
16
|
+
def generate(data)
|
17
|
+
CSV.parse(data)
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
@@ -0,0 +1,14 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module MiniEtl
|
4
|
+
# Fetches a Strategy kind class for a given type of data
|
5
|
+
# extraction/transformation
|
6
|
+
class Strategy
|
7
|
+
class << self
|
8
|
+
# Looks up the strategy class registered for a type, e.g. :csv resolves to
# Strategies::CSVStrategy.
#
# @param type [#to_s] the kind of source
# @return [Class, nil] the matching constant inside Strategies, or nil when
#   there is none
def for(type)
  strategy_constant = "#{type.to_s.upcase}Strategy"
  # const_defined? raises NameError on names that are not legal constants
  # (e.g. a type containing a dash), so those are rejected up front.
  return nil unless strategy_constant.match?(/\A[A-Z]\w*\z/)

  # `false` restricts the lookup to Strategies itself so an unrelated
  # top-level constant with a matching name is never returned.
  Strategies.const_get(strategy_constant, false) if Strategies.const_defined?(strategy_constant, false)
end
|
12
|
+
end
|
13
|
+
end
|
14
|
+
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module MiniEtl
|
4
|
+
# Track a status
|
5
|
+
# Mixin that gives the including class a +status+ reader plus, for every
# known state, a predicate (+state?+) and a setter (+state!+).
#
# The states come from the including class' VALID_STATES constant when it is
# already defined at the point of inclusion; otherwise DEFAULT_STATES is used.
module Status
  DEFAULT_STATES = {
    initialized: 0,
    finished: 1,
    failed: 2
  }.freeze

  # Hook run when the module is included.
  #
  # The reader and the state methods are defined on +base+. Inside this hook
  # +self+ is Status, so defining them without an explicit receiver would put
  # one shared set of methods on the module: every including class would see
  # the states of all the others, and the class included last would overwrite
  # the values of same-named states.
  #
  # @param base [Class] the class including this module
  def self.included(base)
    states = base.const_defined?(:VALID_STATES) ? base.const_get(:VALID_STATES) : DEFAULT_STATES

    base.class_eval do
      attr_reader :status

      states.each do |verb, value|
        define_method("#{verb}?") { @status == value }
        define_method("#{verb}!") { @status = value }
      end
    end
  end
end
|
27
|
+
end
|
@@ -0,0 +1,54 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Support
|
4
|
+
# Test files generation
|
5
|
+
module Generation
|
6
|
+
# COLUMNS = %w[name last_name nationality origin phone bank iban currency segment].freeze
|
7
|
+
RECORD_SIZE = {
|
8
|
+
small: 8_500,
|
9
|
+
medium: 85_000,
|
10
|
+
large: 825_000
|
11
|
+
}.freeze
|
12
|
+
|
13
|
+
# Writes a CSV sample with one dummy record per line to samples/<size>.csv.
#
# NOTE: values are joined verbatim with commas; a value that itself contains
# a comma is not quoted.
#
# @param size [Symbol] a key of RECORD_SIZE
# @raise [KeyError] when +size+ is not a key of RECORD_SIZE
def generate_csv(size)
  # Resolve the record count first so an unknown size leaves no empty file.
  records = RECORD_SIZE.fetch(size)
  check_dir

  # The block form closes the file; `puts` ends every record with a newline
  # so each record lands on its own row.
  File.open("samples/#{size}.csv", 'w') do |sample_file|
    records.times { sample_file.puts(dummy_data.join(',')) }
  end
end
|
21
|
+
|
22
|
+
def generate_json(size)
|
23
|
+
check_dir
|
24
|
+
|
25
|
+
File.open("samples/#{size}.json", 'w') do |sample_file|
|
26
|
+
sample_file.write('[')
|
27
|
+
RECORD_SIZE[size].pred.times { sample_file.write(json_string + ',') }
|
28
|
+
sample_file.write(json_string + ']')
|
29
|
+
sample_file.close
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
private
|
34
|
+
|
35
|
+
def check_dir
|
36
|
+
Dir.mkdir('samples') unless Dir.exist?('samples')
|
37
|
+
end
|
38
|
+
|
39
|
+
def dummy_names
|
40
|
+
%i[name last_name nationality capital_city phone_number bank iban currency industry]
|
41
|
+
end
|
42
|
+
|
43
|
+
# Builds one dummy record, with the values in the same order as dummy_names.
#
# When Faker runs out of unique values the unique generator is cleared and
# the record is built again, a bounded number of times. Returning the result
# of +clear+, as a bare rescue would, hands callers something that is not a
# record.
#
# @return [Array] nine generated values
# @raise [Faker::UniqueGenerator::RetryLimitExceeded] when the record still
#   cannot be built after the retries
def dummy_data
  attempts = 0

  begin
    [
      Faker::Name.first_name,
      Faker::Name.last_name,
      Faker::Nation.nationality,
      Faker::Nation.capital_city,
      Faker::PhoneNumber.phone_number_with_country_code,
      Faker::Bank.name,
      Faker::Bank.iban,
      Faker::Currency.code,
      Faker::IndustrySegments.industry
    ]
  rescue Faker::UniqueGenerator::RetryLimitExceeded
    Faker::UniqueGenerator.clear
    attempts += 1
    retry if attempts < 3
    raise
  end
end
|
49
|
+
|
50
|
+
def json_string
|
51
|
+
JSON.dump(Hash[dummy_names.zip(dummy_data)])
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|
data/mini_etl.gemspec
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
lib = File.expand_path('lib', __dir__)
|
4
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
5
|
+
require 'mini_etl/version'
|
6
|
+
|
7
|
+
Gem::Specification.new do |spec|
|
8
|
+
spec.name = 'mini_etl'
|
9
|
+
spec.version = MiniEtl::VERSION
|
10
|
+
spec.authors = ['Gerardo Galindez']
|
11
|
+
spec.email = ['ggalindezb@gmail.com']
|
12
|
+
|
13
|
+
spec.summary = 'Extract/Transform/Load wrapper'
|
14
|
+
spec.homepage = 'https://github.com/ggalindezb/mini_etl'
|
15
|
+
|
16
|
+
# Prevent pushing this gem to RubyGems.org. To allow pushes either set the 'allowed_push_host'
|
17
|
+
# to allow pushing to a single host or delete this section to allow pushing to any host.
|
18
|
+
# if spec.respond_to?(:metadata)
|
19
|
+
# spec.metadata["allowed_push_host"] = "TODO: Set to 'http://mygemserver.com'"
|
20
|
+
|
21
|
+
# spec.metadata["homepage_uri"] = spec.homepage
|
22
|
+
# spec.metadata["source_code_uri"] = "TODO: Put your gem's public repo URL here."
|
23
|
+
# spec.metadata["changelog_uri"] = "TODO: Put your gem's CHANGELOG.md URL here."
|
24
|
+
# else
|
25
|
+
# raise "RubyGems 2.0 or newer is required to protect against " \
|
26
|
+
# "public gem pushes."
|
27
|
+
# end
|
28
|
+
|
29
|
+
# Specify which files should be added to the gem when it is released.
|
30
|
+
# The `git ls-files -z` loads the files in the RubyGem that have been added into git.
|
31
|
+
spec.files = Dir.chdir(File.expand_path(__dir__)) do
|
32
|
+
`git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
|
33
|
+
end
|
34
|
+
spec.bindir = 'exe'
|
35
|
+
spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
|
36
|
+
spec.require_paths = ['lib']
|
37
|
+
|
38
|
+
spec.add_development_dependency 'bundler', '~> 1.16'
|
39
|
+
spec.add_development_dependency 'faker', '~> 1.9'
|
40
|
+
spec.add_development_dependency 'pry', '~> 0.12.2'
|
41
|
+
spec.add_development_dependency 'rake', '~> 10.0'
|
42
|
+
spec.add_development_dependency 'rspec', '~> 3.0'
|
43
|
+
spec.add_development_dependency 'rubocop', '~> 0.71.0'
|
44
|
+
spec.add_development_dependency 'simplecov', '~> 0.16.1'
|
45
|
+
end
|
metadata
ADDED
@@ -0,0 +1,160 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: mini_etl
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.2.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Gerardo Galindez
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2019-06-07 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.16'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.16'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: faker
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '1.9'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '1.9'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: pry
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 0.12.2
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 0.12.2
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rake
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '10.0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '10.0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rspec
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '3.0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '3.0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: rubocop
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: 0.71.0
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: 0.71.0
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: simplecov
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - "~>"
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: 0.16.1
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - "~>"
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: 0.16.1
|
111
|
+
description:
|
112
|
+
email:
|
113
|
+
- ggalindezb@gmail.com
|
114
|
+
executables: []
|
115
|
+
extensions: []
|
116
|
+
extra_rdoc_files: []
|
117
|
+
files:
|
118
|
+
- ".gitignore"
|
119
|
+
- ".rubocop.yml"
|
120
|
+
- ".rubocop_todo.yml"
|
121
|
+
- CHANGELOG.md
|
122
|
+
- Gemfile
|
123
|
+
- Gemfile.lock
|
124
|
+
- README.md
|
125
|
+
- Rakefile
|
126
|
+
- bin/console
|
127
|
+
- bin/setup
|
128
|
+
- lib/mini_etl.rb
|
129
|
+
- lib/mini_etl/generator.rb
|
130
|
+
- lib/mini_etl/process.rb
|
131
|
+
- lib/mini_etl/source.rb
|
132
|
+
- lib/mini_etl/strategies/csv_strategy.rb
|
133
|
+
- lib/mini_etl/strategy.rb
|
134
|
+
- lib/mini_etl/util/status.rb
|
135
|
+
- lib/mini_etl/version.rb
|
136
|
+
- lib/tasks/support/generation.rb
|
137
|
+
- mini_etl.gemspec
|
138
|
+
homepage: https://github.com/ggalindezb/mini_etl
|
139
|
+
licenses: []
|
140
|
+
metadata: {}
|
141
|
+
post_install_message:
|
142
|
+
rdoc_options: []
|
143
|
+
require_paths:
|
144
|
+
- lib
|
145
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
146
|
+
requirements:
|
147
|
+
- - ">="
|
148
|
+
- !ruby/object:Gem::Version
|
149
|
+
version: '0'
|
150
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
151
|
+
requirements:
|
152
|
+
- - ">="
|
153
|
+
- !ruby/object:Gem::Version
|
154
|
+
version: '0'
|
155
|
+
requirements: []
|
156
|
+
rubygems_version: 3.0.3
|
157
|
+
signing_key:
|
158
|
+
specification_version: 4
|
159
|
+
summary: Extract/Transform/Load wrapper
|
160
|
+
test_files: []
|