remi 0.3.0 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.bundle/config +1 -1
- data/Gemfile +1 -0
- data/Gemfile.lock +45 -5
- data/README.md +245 -0
- data/features/step_definitions/remi_step.rb +16 -0
- data/jobs/sub_job_example_job.rb +5 -5
- data/lib/remi.rb +4 -1
- data/lib/remi/data_subject.rb +10 -1
- data/lib/remi/data_subjects/file_system.rb +31 -1
- data/lib/remi/data_subjects/gsheet.rb +140 -0
- data/lib/remi/data_subjects/sftp_file.rb +1 -0
- data/lib/remi/data_subjects/sub_job.rb +13 -4
- data/lib/remi/encoder.rb +1 -1
- data/lib/remi/job.rb +9 -1
- data/lib/remi/job/parameters.rb +8 -3
- data/lib/remi/job/sub_job.rb +14 -8
- data/lib/remi/loader.rb +14 -2
- data/lib/remi/testing/business_rules.rb +12 -9
- data/lib/remi/transform.rb +9 -0
- data/lib/remi/version.rb +1 -1
- data/spec/data_subject_spec.rb +23 -5
- data/spec/data_subjects/file_system_spec.rb +43 -9
- data/spec/data_subjects/gsheet_spec.rb +133 -0
- data/spec/data_subjects/sub_job_spec.rb +40 -8
- data/spec/job_spec.rb +58 -15
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ea2d2971479e9e0dfcc5de4dd01ac13f5274a6f8
|
4
|
+
data.tar.gz: ea5e3a3280613d00ae29f5265c342740973ea57c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a09f8f926a99891356bcd6363ef4729b3a27bcf9cbfacce2a6bc8a8b60a262cfe83fe40d1e0f1f0c79945693aacb0fb1099b5bad299e022511846730e98642fe
|
7
|
+
data.tar.gz: d160e9840162558b1d9e203825f8e724ce1626e9f8a6922b86f894249b78c5c6680b37fef7e24574d794aac41db025bd887c707df35f6485a3540914552a3293
|
data/.bundle/config
CHANGED
@@ -1,2 +1,2 @@
|
|
1
1
|
---
|
2
|
-
BUNDLE_DISABLE_SHARED_GEMS:
|
2
|
+
BUNDLE_DISABLE_SHARED_GEMS: "true"
|
data/Gemfile
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
source 'https://rubygems.org'
|
3
3
|
|
4
4
|
gemspec
|
5
|
+
gem 'google-api-client', '~> 0.9'
|
5
6
|
gem 'daru', '0.1.4.1', git: 'git@github.com:inside-track/daru.git', branch: '0.1.4.1-Remi'
|
6
7
|
gem 'restforce', '~> 2.1'
|
7
8
|
gem 'salesforce_bulk_api', git: 'git@github.com:inside-track/salesforce_bulk_api.git', branch: 'master'
|
data/Gemfile.lock
CHANGED
@@ -18,7 +18,7 @@ GIT
|
|
18
18
|
PATH
|
19
19
|
remote: .
|
20
20
|
specs:
|
21
|
-
remi (0.3.0)
|
21
|
+
remi (0.3.1)
|
22
22
|
activesupport (~> 4.2)
|
23
23
|
bond (~> 0.5)
|
24
24
|
cucumber (~> 2.1)
|
@@ -39,6 +39,7 @@ GEM
|
|
39
39
|
minitest (~> 5.1)
|
40
40
|
thread_safe (~> 0.3, >= 0.3.4)
|
41
41
|
tzinfo (~> 1.1)
|
42
|
+
addressable (2.4.0)
|
42
43
|
aws-sdk (2.3.5)
|
43
44
|
aws-sdk-resources (= 2.3.5)
|
44
45
|
aws-sdk-core (2.3.5)
|
@@ -69,7 +70,26 @@ GEM
|
|
69
70
|
faraday (>= 0.7.4, < 0.10)
|
70
71
|
gherkin (3.2.0)
|
71
72
|
github-markup (1.4.0)
|
73
|
+
google-api-client (0.9.15)
|
74
|
+
addressable (~> 2.3)
|
75
|
+
googleauth (~> 0.5)
|
76
|
+
httpclient (~> 2.7)
|
77
|
+
hurley (~> 0.1)
|
78
|
+
memoist (~> 0.11)
|
79
|
+
mime-types (>= 1.6)
|
80
|
+
representable (~> 2.3.0)
|
81
|
+
retriable (~> 2.0)
|
82
|
+
googleauth (0.5.1)
|
83
|
+
faraday (~> 0.9)
|
84
|
+
jwt (~> 1.4)
|
85
|
+
logging (~> 2.0)
|
86
|
+
memoist (~> 0.12)
|
87
|
+
multi_json (~> 1.11)
|
88
|
+
os (~> 0.9)
|
89
|
+
signet (~> 0.7)
|
72
90
|
hashie (3.4.3)
|
91
|
+
httpclient (2.8.2.4)
|
92
|
+
hurley (0.2)
|
73
93
|
i18n (0.7.0)
|
74
94
|
iruby (0.2.7)
|
75
95
|
bond (~> 0.5)
|
@@ -80,6 +100,15 @@ GEM
|
|
80
100
|
json_pure (>= 1.8.1)
|
81
101
|
json (1.8.3)
|
82
102
|
json_pure (1.8.3)
|
103
|
+
jwt (1.5.6)
|
104
|
+
little-plugger (1.1.4)
|
105
|
+
logging (2.1.0)
|
106
|
+
little-plugger (~> 1.1)
|
107
|
+
multi_json (~> 1.10)
|
108
|
+
memoist (0.15.0)
|
109
|
+
mime-types (3.1)
|
110
|
+
mime-types-data (~> 3.2015)
|
111
|
+
mime-types-data (3.2016.0521)
|
83
112
|
mimemagic (0.3.1)
|
84
113
|
minitest (5.8.4)
|
85
114
|
multi_json (1.11.2)
|
@@ -88,16 +117,20 @@ GEM
|
|
88
117
|
net-sftp (2.1.2)
|
89
118
|
net-ssh (>= 2.6.5)
|
90
119
|
net-ssh (3.1.1)
|
120
|
+
os (0.9.6)
|
91
121
|
pg (0.18.4)
|
92
122
|
rbczmq (1.7.9)
|
93
123
|
redcarpet (3.3.4)
|
94
124
|
regex_sieve (0.1.0)
|
95
125
|
regexp-examples (1.2.0)
|
126
|
+
representable (2.3.0)
|
127
|
+
uber (~> 0.0.7)
|
96
128
|
restforce (2.2.0)
|
97
129
|
faraday (~> 0.9.0)
|
98
130
|
faraday_middleware (>= 0.8.8)
|
99
131
|
hashie (>= 1.2.0, < 4.0)
|
100
132
|
json (>= 1.7.5, < 1.9.0)
|
133
|
+
retriable (2.1.0)
|
101
134
|
rspec (3.4.0)
|
102
135
|
rspec-core (~> 3.4.0)
|
103
136
|
rspec-expectations (~> 3.4.0)
|
@@ -111,9 +144,15 @@ GEM
|
|
111
144
|
diff-lcs (>= 1.2.0, < 2.0)
|
112
145
|
rspec-support (~> 3.4.0)
|
113
146
|
rspec-support (3.4.1)
|
147
|
+
signet (0.7.3)
|
148
|
+
addressable (~> 2.3)
|
149
|
+
faraday (~> 0.9)
|
150
|
+
jwt (~> 1.5)
|
151
|
+
multi_json (~> 1.10)
|
114
152
|
thread_safe (0.3.5)
|
115
153
|
tzinfo (1.2.2)
|
116
154
|
thread_safe (~> 0.1)
|
155
|
+
uber (0.0.15)
|
117
156
|
xml-simple (1.1.5)
|
118
157
|
yard (0.9.0)
|
119
158
|
|
@@ -123,13 +162,14 @@ PLATFORMS
|
|
123
162
|
DEPENDENCIES
|
124
163
|
aws-sdk (~> 2.3)
|
125
164
|
daru (= 0.1.4.1)!
|
126
|
-
github-markup
|
165
|
+
github-markup (~> 1.4)
|
166
|
+
google-api-client (~> 0.9)
|
127
167
|
iruby (= 0.2.7)
|
128
|
-
redcarpet
|
168
|
+
redcarpet (~> 3.3)
|
129
169
|
remi!
|
130
170
|
restforce (~> 2.1)
|
131
171
|
salesforce_bulk_api!
|
132
|
-
yard
|
172
|
+
yard (~> 0.9)
|
133
173
|
|
134
174
|
BUNDLED WITH
|
135
|
-
1.
|
175
|
+
1.13.5
|
data/README.md
CHANGED
@@ -24,6 +24,11 @@ Remi will follow [semantic versioning](http://semver.org/) principles.
|
|
24
24
|
Of course, while we're still on major version zero, little effort will
|
25
25
|
be made to maintain backward compatibility.
|
26
26
|
|
27
|
+
The data transformation layer is built on top of
|
28
|
+
[Daru dataframe](https://github.com/v0dro/daru). Familiarity with
|
29
|
+
Daru dataframes is essential for writing complex transformations in
|
30
|
+
Remi.
|
31
|
+
|
27
32
|
## Getting Started
|
28
33
|
|
29
34
|
Add the gem to your Gemfile, `bundle install`, and then initialize your repository as
|
@@ -40,8 +45,248 @@ sure this works by running
|
|
40
45
|
|
41
46
|
All of the test should pass.
|
42
47
|
|
48
|
+
## Remi Jobs
|
49
|
+
|
50
|
+
A Remi job describes the data sources that will be used to collect
|
51
|
+
data, the transformations that will be performed on the data, and the
|
52
|
+
data targets that will be populated when all transformations are
|
53
|
+
complete. With Remi, an ETL process is defined in a class that
|
54
|
+
inherits from the `Remi::Job` class.
|
55
|
+
|
56
|
+
### Hello World
|
57
|
+
|
58
|
+
A very simple "Hello World" example of a Remi job would be
|
59
|
+
|
60
|
+
````ruby
|
61
|
+
class HelloWorldJob < Remi::Job
|
62
|
+
transform :say_hi do
|
63
|
+
puts "Hello World"
|
64
|
+
end
|
65
|
+
end
|
66
|
+
````
|
67
|
+
|
68
|
+
This job doesn't make use of any data subjects (data sources or data
|
69
|
+
targets), but it does define a single data transform called `:say_hi`.
|
70
|
+
The full job can be executed by calling the `#execute` method on an
|
71
|
+
instance of the `HelloWorldJob` class
|
72
|
+
|
73
|
+
````ruby
|
74
|
+
job = HelloWorldJob.new
|
75
|
+
job.execute
|
76
|
+
#=> "Hello World"
|
77
|
+
````
|
78
|
+
|
79
|
+
The transform called `say_hi` is just a method of the `HelloWorldJob`
|
80
|
+
class representing a job transform object. Multiple transforms can be
|
81
|
+
defined in a Remi job. To execute a specific transform we can call that transform by
|
82
|
+
name using
|
83
|
+
|
84
|
+
````ruby
|
85
|
+
job = HelloWorldJob.new
|
86
|
+
job.say_hi.execute
|
87
|
+
#=> "Hello World"
|
88
|
+
````
|
89
|
+
|
90
|
+
### A more complete example
|
91
|
+
|
92
|
+
Suppose we have a database containing data on beer sales. It's a
|
93
|
+
normalized database where we store data on individual beers sold in a
|
94
|
+
`beer_sales_fact` table and information on the details of the beer in
|
95
|
+
a `beers_dim` table. We'd like to extract data from both of these
|
96
|
+
sources, combine them into a single flattened table and save it as a
|
97
|
+
CSV file. This operation could be performed with the following Remi
|
98
|
+
job. (Of course, if this were a real world problem, we'd do the join
|
99
|
+
in the database before extracting; this is a contrived example to show
|
100
|
+
how one can combine data from multiple arbitrary sources).
|
101
|
+
|
102
|
+
|
103
|
+
````ruby
|
104
|
+
class DenormalizeBeersJob < Remi::Job
|
105
|
+
source :beer_sales_fact do
|
106
|
+
extractor Remi::Extractor::Postgres.new(
|
107
|
+
credentials: {
|
108
|
+
dbname: 'my_local_db'
|
109
|
+
},
|
110
|
+
query: 'SELECT beer_id, sold_date, quantity FROM beer_sales_fact'
|
111
|
+
)
|
112
|
+
parser Remi::Parser::Postgres.new
|
113
|
+
|
114
|
+
fields(
|
115
|
+
{
|
116
|
+
:beer_id => {},
|
117
|
+
:sold_at => { type: :date, in_format: '%Y-%m-%d' },
|
118
|
+
:quantity => { type: :integer }
|
119
|
+
}
|
120
|
+
)
|
121
|
+
end
|
122
|
+
|
123
|
+
source :beers_dim do
|
124
|
+
extractor Remi::Extractor::Postgres.new(
|
125
|
+
credentials: {
|
126
|
+
dbname: 'my_local_db'
|
127
|
+
},
|
128
|
+
query: 'SELECT beer_id, name, price_per_unit FROM beers_dim'
|
129
|
+
)
|
130
|
+
parser Remi::Parser::Postgres.new
|
131
|
+
|
132
|
+
fields(
|
133
|
+
{
|
134
|
+
:beer_id => {},
|
135
|
+
:name => {},
|
136
|
+
:price_per_unit => { type: :decimal, scale: 2 }
|
137
|
+
}
|
138
|
+
)
|
139
|
+
end
|
140
|
+
|
141
|
+
target :flat_beer_file do
|
142
|
+
encoder Remi::Encoder::CsvFile.new
|
143
|
+
loader Remi::Loader::LocalFile.new(
|
144
|
+
path: 'flat_beers.csv'
|
145
|
+
)
|
146
|
+
end
|
147
|
+
|
148
|
+
transform :type_enforcement do
|
149
|
+
beer_sales_fact.enforce_types
|
150
|
+
beers_dim.enforce_types
|
151
|
+
end
|
152
|
+
|
153
|
+
transform :flatten do
|
154
|
+
flat_beer_file.df = beer_sales_fact.df.join(beers_dim.df, on: [:beer_id], how: :inner)
|
155
|
+
|
156
|
+
Remi::SourceToTargetMap.apply(flat_beer_file.df) do
|
157
|
+
map source(:quantity, :price_per_unit) .target(:total_price)
|
158
|
+
.transform(->(row) {
|
159
|
+
row[:quantity] * row[:price_per_unit]
|
160
|
+
})
|
161
|
+
end
|
162
|
+
end
|
163
|
+
end
|
164
|
+
````
|
165
|
+
|
166
|
+
### Components of a Remi Job
|
167
|
+
|
168
|
+
A Remi job is composed of one or more of the following elements, which are described
|
169
|
+
in more detail below. All of these elements are defined using class methods (part
|
170
|
+
of `Remi::Job`). Each of the elements is given a name and defined in a block.
|
171
|
+
|
172
|
+
* Data Subjects - A data subject is either a data source or a data target.
|
173
|
+
* Data Sources - A data source describes where data is extracted from.
|
174
|
+
````ruby
|
175
|
+
source :my_source do
|
176
|
+
# ... source definition
|
177
|
+
end
|
178
|
+
````
|
179
|
+
* Data Targets - A data target describes where data is loaded to.
|
180
|
+
````ruby
|
181
|
+
target :my_target do
|
182
|
+
# ... target definition
|
183
|
+
end
|
184
|
+
````
|
185
|
+
|
186
|
+
* Transforms - A transform is essentially an arbitrary block of Ruby
|
187
|
+
code, but is typically used to transform data sources into data targets.
|
188
|
+
````ruby
|
189
|
+
transform :my_transform do
|
190
|
+
# ... lots of code
|
191
|
+
end
|
192
|
+
````
|
193
|
+
|
194
|
+
* Job Parameters - A job parameter is a memoized block of code
|
195
|
+
(similar to RSpec's `let` method) that is used to configure a job and may
|
196
|
+
be overridden at runtime if needed.
|
197
|
+
````ruby
|
198
|
+
param :my_param do
|
199
|
+
# ... the return value of this block is memoized
|
200
|
+
end
|
201
|
+
````
|
202
|
+
|
203
|
+
* Sub Transforms - Sub transforms are essentially transforms, but they are NOT
|
204
|
+
automatically executed when the job is executed. Instead, they must be _imported_
|
205
|
+
in a transform. They are meant to be reusable bits of transform code.
|
206
|
+
````ruby
|
207
|
+
sub_transform :my_sub_transform do
|
208
|
+
#... sub_transform stuff
|
209
|
+
end
|
210
|
+
````
|
211
|
+
|
212
|
+
* Sub Jobs - Sub jobs are simply references to other Remi jobs that may be executed
|
213
|
+
within the current job.
|
214
|
+
````ruby
|
215
|
+
sub_job(:my_sub_job) { MySubJob.new }
|
216
|
+
````
|
217
|
+
|
218
|
+
|
219
|
+
|
220
|
+
### Execution Plan
|
221
|
+
|
222
|
+
The `DenormalizeBeersJob` example above can be executed using
|
223
|
+
|
224
|
+
````ruby
|
225
|
+
job = DenormalizeBeersJob.new
|
226
|
+
job.execute
|
227
|
+
````
|
228
|
+
|
229
|
+
Calling `#execute` on an instance of a job does the following, in this order:
|
230
|
+
1. All transforms defined in the job (via `transform :name do ... end`) are executed
|
231
|
+
in the order they were defined in the class definition.
|
232
|
+
2. All data targets are loaded in the order they are defined in the job.
|
233
|
+
|
234
|
+
Note that data sources are not extracted until the moment the data is
|
235
|
+
needed in a transform. If the source data is never referenced in a
|
236
|
+
transform, it is never extracted.
|
237
|
+
|
238
|
+
|
239
|
+
## Data Subjects
|
240
|
+
|
241
|
+
A _data subject_ refers to either a data source or a data target.
|
242
|
+
Either way, a data subject is associated with a data frame. Currently
|
243
|
+
the only data frames supported are
|
244
|
+
[Daru data frames](https://github.com/v0dro/daru), but support for
|
245
|
+
other data frames may be developed in the future. The data frame associated
|
246
|
+
with a data subject is accessed with the `#df` method and assigned with the `#df=`
|
247
|
+
method.
|
248
|
+
````ruby
|
249
|
+
my_data_subject.df #=> Daru::DataFrame
|
250
|
+
my_data_subject.df = Daru::DataFrame.new(...)
|
251
|
+
````
|
252
|
+
|
253
|
+
Additionally, all data subjects can be associated with a set of fields and field
|
254
|
+
metadata. Associating a data subject with field data allows us to develop
|
255
|
+
generic ETL routines that are triggered by arbitrary metadata that may be associated
|
256
|
+
with a field.
|
257
|
+
|
258
|
+
### Sources
|
259
|
+
|
260
|
+
### Targets
|
261
|
+
|
262
|
+
### Field Metadata
|
263
|
+
|
264
|
+
|
265
|
+
|
266
|
+
## Available Data Subjects
|
267
|
+
|
268
|
+
* CSV Files
|
269
|
+
* DataFrames
|
270
|
+
* None
|
271
|
+
* Local files
|
272
|
+
* SFTP Files
|
273
|
+
* S3 Files
|
274
|
+
* Salesforce
|
275
|
+
* Postgres
|
276
|
+
|
277
|
+
## Transforms
|
278
|
+
|
279
|
+
## Sub Jobs
|
280
|
+
|
281
|
+
## Job Parameters
|
282
|
+
|
283
|
+
## Sub Transforms
|
284
|
+
|
43
285
|
## Transforming Data
|
44
286
|
|
287
|
+
When `#execute` is called on an instance of a `Remi::Job`, all transforms are executed in
|
288
|
+
the order defined in the class.
|
289
|
+
|
45
290
|
TODO:
|
46
291
|
|
47
292
|
Describe Daru foundation
|
@@ -61,6 +61,7 @@ end
|
|
61
61
|
|
62
62
|
Then /^the file that comes last in an alphanumeric sort by group will be downloaded for processing$/ do
|
63
63
|
expect(@brt.source.data_subject.extractors.map(&:most_recent_by)).to include :name
|
64
|
+
expect(@brt.source.data_subject.extractors.map(&:most_recent_only)).not_to include false
|
64
65
|
end
|
65
66
|
|
66
67
|
Then /^the file is uploaded to the remote path "([^"]+)"$/ do |remote_path|
|
@@ -626,6 +627,21 @@ Then /^only the following fields should be present on the target:$/ do |table|
|
|
626
627
|
expect(@brt.target.data_subject.df.vectors.to_a).to match_array @brt.target.fields.field_names
|
627
628
|
end
|
628
629
|
|
630
|
+
Then /^only the following fields should be present on the targets:$/ do |table|
|
631
|
+
table.rows.each do |row|
|
632
|
+
field = row[0]
|
633
|
+
targets = row[1].split(',')
|
634
|
+
targets.each { |target| step "the target field '#{target}: #{field}'" }
|
635
|
+
end
|
636
|
+
|
637
|
+
@brt.run_transforms
|
638
|
+
@brt.targets.keys.each do |target|
|
639
|
+
expect(@brt.targets[target].data_subject.df.vectors.to_a).to match_array @brt.targets[target].fields.field_names
|
640
|
+
end
|
641
|
+
end
|
642
|
+
|
643
|
+
|
644
|
+
|
629
645
|
### Record-level expectations
|
630
646
|
|
631
647
|
Then /^the record from source '(.+)' should be (?i)(Retained|Rejected)(?-i)(?: without error|)$/ do |source_name, action|
|
data/jobs/sub_job_example_job.rb
CHANGED
@@ -73,14 +73,14 @@ class SubJobExampleJob < Remi::Job
|
|
73
73
|
target :zombified_beers
|
74
74
|
|
75
75
|
transform :zombification do
|
76
|
-
# Sub jobs
|
77
|
-
beers_job
|
76
|
+
# Sub jobs are executed when data from a sub job is requested
|
77
|
+
# Here, the sub job beers_job is executed
|
78
78
|
just_beers.df = beer_fridge.df
|
79
79
|
|
80
|
-
#
|
80
|
+
# Data is supplied to the sub job on assignment
|
81
81
|
beers_to_zombify.df = just_beers.df
|
82
|
-
|
83
|
-
zombify_job
|
82
|
+
|
83
|
+
# Here, the sub job zombify_job is executed using the data supplied to it above
|
84
84
|
zombified_beers.df = zombie_fridge.df
|
85
85
|
end
|
86
86
|
end
|