remi 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.bundle/config +1 -1
- data/Gemfile +1 -0
- data/Gemfile.lock +45 -5
- data/README.md +245 -0
- data/features/step_definitions/remi_step.rb +16 -0
- data/jobs/sub_job_example_job.rb +5 -5
- data/lib/remi.rb +4 -1
- data/lib/remi/data_subject.rb +10 -1
- data/lib/remi/data_subjects/file_system.rb +31 -1
- data/lib/remi/data_subjects/gsheet.rb +140 -0
- data/lib/remi/data_subjects/sftp_file.rb +1 -0
- data/lib/remi/data_subjects/sub_job.rb +13 -4
- data/lib/remi/encoder.rb +1 -1
- data/lib/remi/job.rb +9 -1
- data/lib/remi/job/parameters.rb +8 -3
- data/lib/remi/job/sub_job.rb +14 -8
- data/lib/remi/loader.rb +14 -2
- data/lib/remi/testing/business_rules.rb +12 -9
- data/lib/remi/transform.rb +9 -0
- data/lib/remi/version.rb +1 -1
- data/spec/data_subject_spec.rb +23 -5
- data/spec/data_subjects/file_system_spec.rb +43 -9
- data/spec/data_subjects/gsheet_spec.rb +133 -0
- data/spec/data_subjects/sub_job_spec.rb +40 -8
- data/spec/job_spec.rb +58 -15
- metadata +5 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ea2d2971479e9e0dfcc5de4dd01ac13f5274a6f8
|
4
|
+
data.tar.gz: ea5e3a3280613d00ae29f5265c342740973ea57c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: a09f8f926a99891356bcd6363ef4729b3a27bcf9cbfacce2a6bc8a8b60a262cfe83fe40d1e0f1f0c79945693aacb0fb1099b5bad299e022511846730e98642fe
|
7
|
+
data.tar.gz: d160e9840162558b1d9e203825f8e724ce1626e9f8a6922b86f894249b78c5c6680b37fef7e24574d794aac41db025bd887c707df35f6485a3540914552a3293
|
data/.bundle/config
CHANGED
@@ -1,2 +1,2 @@
|
|
1
1
|
---
|
2
|
-
BUNDLE_DISABLE_SHARED_GEMS:
|
2
|
+
BUNDLE_DISABLE_SHARED_GEMS: "true"
|
data/Gemfile
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
source 'https://rubygems.org'
|
3
3
|
|
4
4
|
gemspec
|
5
|
+
gem 'google-api-client', '~> 0.9'
|
5
6
|
gem 'daru', '0.1.4.1', git: 'git@github.com:inside-track/daru.git', branch: '0.1.4.1-Remi'
|
6
7
|
gem 'restforce', '~> 2.1'
|
7
8
|
gem 'salesforce_bulk_api', git: 'git@github.com:inside-track/salesforce_bulk_api.git', branch: 'master'
|
data/Gemfile.lock
CHANGED
@@ -18,7 +18,7 @@ GIT
|
|
18
18
|
PATH
|
19
19
|
remote: .
|
20
20
|
specs:
|
21
|
-
remi (0.3.0)
|
21
|
+
remi (0.3.1)
|
22
22
|
activesupport (~> 4.2)
|
23
23
|
bond (~> 0.5)
|
24
24
|
cucumber (~> 2.1)
|
@@ -39,6 +39,7 @@ GEM
|
|
39
39
|
minitest (~> 5.1)
|
40
40
|
thread_safe (~> 0.3, >= 0.3.4)
|
41
41
|
tzinfo (~> 1.1)
|
42
|
+
addressable (2.4.0)
|
42
43
|
aws-sdk (2.3.5)
|
43
44
|
aws-sdk-resources (= 2.3.5)
|
44
45
|
aws-sdk-core (2.3.5)
|
@@ -69,7 +70,26 @@ GEM
|
|
69
70
|
faraday (>= 0.7.4, < 0.10)
|
70
71
|
gherkin (3.2.0)
|
71
72
|
github-markup (1.4.0)
|
73
|
+
google-api-client (0.9.15)
|
74
|
+
addressable (~> 2.3)
|
75
|
+
googleauth (~> 0.5)
|
76
|
+
httpclient (~> 2.7)
|
77
|
+
hurley (~> 0.1)
|
78
|
+
memoist (~> 0.11)
|
79
|
+
mime-types (>= 1.6)
|
80
|
+
representable (~> 2.3.0)
|
81
|
+
retriable (~> 2.0)
|
82
|
+
googleauth (0.5.1)
|
83
|
+
faraday (~> 0.9)
|
84
|
+
jwt (~> 1.4)
|
85
|
+
logging (~> 2.0)
|
86
|
+
memoist (~> 0.12)
|
87
|
+
multi_json (~> 1.11)
|
88
|
+
os (~> 0.9)
|
89
|
+
signet (~> 0.7)
|
72
90
|
hashie (3.4.3)
|
91
|
+
httpclient (2.8.2.4)
|
92
|
+
hurley (0.2)
|
73
93
|
i18n (0.7.0)
|
74
94
|
iruby (0.2.7)
|
75
95
|
bond (~> 0.5)
|
@@ -80,6 +100,15 @@ GEM
|
|
80
100
|
json_pure (>= 1.8.1)
|
81
101
|
json (1.8.3)
|
82
102
|
json_pure (1.8.3)
|
103
|
+
jwt (1.5.6)
|
104
|
+
little-plugger (1.1.4)
|
105
|
+
logging (2.1.0)
|
106
|
+
little-plugger (~> 1.1)
|
107
|
+
multi_json (~> 1.10)
|
108
|
+
memoist (0.15.0)
|
109
|
+
mime-types (3.1)
|
110
|
+
mime-types-data (~> 3.2015)
|
111
|
+
mime-types-data (3.2016.0521)
|
83
112
|
mimemagic (0.3.1)
|
84
113
|
minitest (5.8.4)
|
85
114
|
multi_json (1.11.2)
|
@@ -88,16 +117,20 @@ GEM
|
|
88
117
|
net-sftp (2.1.2)
|
89
118
|
net-ssh (>= 2.6.5)
|
90
119
|
net-ssh (3.1.1)
|
120
|
+
os (0.9.6)
|
91
121
|
pg (0.18.4)
|
92
122
|
rbczmq (1.7.9)
|
93
123
|
redcarpet (3.3.4)
|
94
124
|
regex_sieve (0.1.0)
|
95
125
|
regexp-examples (1.2.0)
|
126
|
+
representable (2.3.0)
|
127
|
+
uber (~> 0.0.7)
|
96
128
|
restforce (2.2.0)
|
97
129
|
faraday (~> 0.9.0)
|
98
130
|
faraday_middleware (>= 0.8.8)
|
99
131
|
hashie (>= 1.2.0, < 4.0)
|
100
132
|
json (>= 1.7.5, < 1.9.0)
|
133
|
+
retriable (2.1.0)
|
101
134
|
rspec (3.4.0)
|
102
135
|
rspec-core (~> 3.4.0)
|
103
136
|
rspec-expectations (~> 3.4.0)
|
@@ -111,9 +144,15 @@ GEM
|
|
111
144
|
diff-lcs (>= 1.2.0, < 2.0)
|
112
145
|
rspec-support (~> 3.4.0)
|
113
146
|
rspec-support (3.4.1)
|
147
|
+
signet (0.7.3)
|
148
|
+
addressable (~> 2.3)
|
149
|
+
faraday (~> 0.9)
|
150
|
+
jwt (~> 1.5)
|
151
|
+
multi_json (~> 1.10)
|
114
152
|
thread_safe (0.3.5)
|
115
153
|
tzinfo (1.2.2)
|
116
154
|
thread_safe (~> 0.1)
|
155
|
+
uber (0.0.15)
|
117
156
|
xml-simple (1.1.5)
|
118
157
|
yard (0.9.0)
|
119
158
|
|
@@ -123,13 +162,14 @@ PLATFORMS
|
|
123
162
|
DEPENDENCIES
|
124
163
|
aws-sdk (~> 2.3)
|
125
164
|
daru (= 0.1.4.1)!
|
126
|
-
github-markup
|
165
|
+
github-markup (~> 1.4)
|
166
|
+
google-api-client (~> 0.9)
|
127
167
|
iruby (= 0.2.7)
|
128
|
-
redcarpet
|
168
|
+
redcarpet (~> 3.3)
|
129
169
|
remi!
|
130
170
|
restforce (~> 2.1)
|
131
171
|
salesforce_bulk_api!
|
132
|
-
yard
|
172
|
+
yard (~> 0.9)
|
133
173
|
|
134
174
|
BUNDLED WITH
|
135
|
-
1.
|
175
|
+
1.13.5
|
data/README.md
CHANGED
@@ -24,6 +24,11 @@ Remi will follow [semantic versioning](http://semver.org/) principles.
|
|
24
24
|
Of course, while we're still on major version zero, little effort will
|
25
25
|
be made to maintain backward compatibility.
|
26
26
|
|
27
|
+
The data transformation layer is built on top of
|
28
|
+
[Daru dataframe](https://github.com/v0dro/daru). Familiarity with
|
29
|
+
Daru dataframes is essential for writing complex transformations in
|
30
|
+
Remi.
|
31
|
+
|
27
32
|
## Getting Started
|
28
33
|
|
29
34
|
Add the gem to your Gemfile, `bundle install`, and then initialize your repository as
|
@@ -40,8 +45,248 @@ sure this works by running
|
|
40
45
|
|
41
46
|
All of the tests should pass.
|
42
47
|
|
48
|
+
## Remi Jobs
|
49
|
+
|
50
|
+
A Remi job describes the data sources that will be used to collect
|
51
|
+
data, the transformations that will be performed on the data, and the
|
52
|
+
data targets that will be populated when all transformations are
|
53
|
+
complete. With Remi, an ETL process is defined in a class that
|
54
|
+
inherits from the `Remi::Job` class.
|
55
|
+
|
56
|
+
### Hello World
|
57
|
+
|
58
|
+
A very simple "Hello World" example of a Remi job would be
|
59
|
+
|
60
|
+
````ruby
|
61
|
+
class HelloWorldJob < Remi::Job
|
62
|
+
transform :say_hi do
|
63
|
+
puts "Hello World"
|
64
|
+
end
|
65
|
+
end
|
66
|
+
````
|
67
|
+
|
68
|
+
This job doesn't make use of any data subjects (data sources or data
|
69
|
+
targets), but it does define a single data transform called `:say_hi`.
|
70
|
+
The full job can be executed by calling the `#execute` method on an
|
71
|
+
instance of the `HelloWorldJob` class
|
72
|
+
|
73
|
+
````ruby
|
74
|
+
job = HelloWorldJob.new
|
75
|
+
job.execute
|
76
|
+
#=> "Hello World"
|
77
|
+
````
|
78
|
+
|
79
|
+
The transform called `say_hi` is just a method of the `HelloWorldJob`
|
80
|
+
class representing a job transform object. Multiple transforms can be
|
81
|
+
defined in a Remi job. To execute a specific transform we can call that transform by
|
82
|
+
name using
|
83
|
+
|
84
|
+
````ruby
|
85
|
+
job = HelloWorldJob.new
|
86
|
+
job.say_hi.execute
|
87
|
+
#=> "Hello World"
|
88
|
+
````
|
89
|
+
|
90
|
+
### A more complete example
|
91
|
+
|
92
|
+
Suppose we have a database containing data on beer sales. It's a
|
93
|
+
normalized database where we store data on individual beers sold in a
|
94
|
+
`beer_sales_fact` table and information on the details of the beer in
|
95
|
+
a `beers_dim` table. We'd like to extract data from both of these
|
96
|
+
sources, combine them into a single flattened table and save it as a
|
97
|
+
CSV file. This operation could be performed with the following Remi
|
98
|
+
job. (Of course, if this were a real world problem, we'd do the join
|
99
|
+
in the database before extracting; this is a contrived example to show
|
100
|
+
how one can combine data from multiple arbitrary sources).
|
101
|
+
|
102
|
+
|
103
|
+
````ruby
|
104
|
+
class DenormalizeBeersJob < Remi::Job
|
105
|
+
source :beer_sales_fact do
|
106
|
+
extractor Remi::Extractor::Postgres.new(
|
107
|
+
credentials: {
|
108
|
+
dbname: 'my_local_db'
|
109
|
+
},
|
110
|
+
query: 'SELECT beer_id, sold_date, quantity FROM beer_sales_fact'
|
111
|
+
)
|
112
|
+
parser Remi::Parser::Postgres.new
|
113
|
+
|
114
|
+
fields(
|
115
|
+
{
|
116
|
+
:beer_id => {},
|
117
|
+
:sold_at => { type: :date, in_format: '%Y-%m-%d' },
|
118
|
+
:quantity => { type: :integer }
|
119
|
+
}
|
120
|
+
)
|
121
|
+
end
|
122
|
+
|
123
|
+
source :beers_dim do
|
124
|
+
extractor Remi::Extractor::Postgres.new(
|
125
|
+
credentials: {
|
126
|
+
dbname: 'my_local_db'
|
127
|
+
},
|
128
|
+
query: 'SELECT beer_id, name, price_per_unit FROM beers_dim'
|
129
|
+
)
|
130
|
+
parser Remi::Parser::Postgres.new
|
131
|
+
|
132
|
+
fields(
|
133
|
+
{
|
134
|
+
:beer_id => {},
|
135
|
+
:name => {},
|
136
|
+
:price_per_unit => { type: :decimal, scale: 2 }
|
137
|
+
}
|
138
|
+
)
|
139
|
+
end
|
140
|
+
|
141
|
+
target :flat_beer_file do
|
142
|
+
encoder Remi::Encoder::CsvFile.new
|
143
|
+
loader Remi::Loader::LocalFile.new(
|
144
|
+
path: 'flat_beers.csv'
|
145
|
+
)
|
146
|
+
end
|
147
|
+
|
148
|
+
transform :type_enforcement do
|
149
|
+
beer_sales_fact.enforce_types
|
150
|
+
beers_dim.enforce_types
|
151
|
+
end
|
152
|
+
|
153
|
+
transform :flatten do
|
154
|
+
flat_beer_file.df = beer_sales_fact.df.join(flat_beer_file.df, on: [:beer_id], how: :inner)
|
155
|
+
|
156
|
+
Remi::SourceToTargetMap.apply(flat_beer_file.df) do
|
157
|
+
map source(:quantity, :price_per_unit) .target(:total_price)
|
158
|
+
.transform(->(row) {
|
159
|
+
row[:quantity] * row[:price_per_unit]
|
160
|
+
})
|
161
|
+
end
|
162
|
+
end
|
163
|
+
end
|
164
|
+
````
|
165
|
+
|
166
|
+
### Components of a Remi Job
|
167
|
+
|
168
|
+
A Remi job is composed of one or more of the following elements, which are described
|
169
|
+
in more detail below. All of these elements are defined using class methods (part
|
170
|
+
of `Remi::Job`). Each of the elements is given a name and defined in a block.
|
171
|
+
|
172
|
+
* Data Subjects - A data subject is either a data source or a data target.
|
173
|
+
* Data Sources - A data source describes where data is extracted from.
|
174
|
+
````ruby
|
175
|
+
source :my_source do
|
176
|
+
# ... source definition
|
177
|
+
end
|
178
|
+
````
|
179
|
+
* Data Targets - A data target describes where data is loaded to.
|
180
|
+
````ruby
|
181
|
+
target :my_target do
|
182
|
+
# ... target definition
|
183
|
+
end
|
184
|
+
````
|
185
|
+
|
186
|
+
* Transforms - A transform is essentially an arbitrary block of Ruby
|
187
|
+
code, but is typically used to transform data sources into data targets.
|
188
|
+
````ruby
|
189
|
+
transform :my_transform do
|
190
|
+
# ... lots of code
|
191
|
+
end
|
192
|
+
````
|
193
|
+
|
194
|
+
* Job Parameters - A job parameter is a memoized block of code
|
195
|
+
(similar to RSpec's `let` method) that is used to configure a job and may
|
196
|
+
be overridden at runtime if needed.
|
197
|
+
````ruby
|
198
|
+
param :my_param do
|
199
|
+
# ... the return value of this block is memoized
|
200
|
+
end
|
201
|
+
````
|
202
|
+
|
203
|
+
* Sub Transforms - Sub transforms are essentially transforms, but they are NOT
|
204
|
+
automatically executed when the job is executed. Instead, they must be _imported_
|
205
|
+
in a transform. They are meant to be reusable bits of transform code.
|
206
|
+
````ruby
|
207
|
+
sub_transform :my_sub_transform do
|
208
|
+
#... sub_transform stuff
|
209
|
+
end
|
210
|
+
````
|
211
|
+
|
212
|
+
* Sub Jobs - Sub jobs are simply references to other Remi jobs that may be executed
|
213
|
+
within the current job.
|
214
|
+
````ruby
|
215
|
+
sub_job :my_sub_job { MySubJob.new }
|
216
|
+
````
|
217
|
+
|
218
|
+
|
219
|
+
|
220
|
+
### Execution Plan
|
221
|
+
|
222
|
+
The `DenormalizeBeersJob` example above can be executed using
|
223
|
+
|
224
|
+
````ruby
|
225
|
+
job = DenormalizeBeersJob.new
|
226
|
+
job.execute
|
227
|
+
````
|
228
|
+
|
229
|
+
Calling `#execute` on an instance of a job does the following, in this order:
|
230
|
+
1. All transforms defined in the job (via `transform :name do ... end`) are executed
|
231
|
+
in the order they were defined in the class definition.
|
232
|
+
2. All data targets are loaded in the order they are defined in the job.
|
233
|
+
|
234
|
+
Note that data sources are not extracted until the moment the data is
|
235
|
+
needed in a transform. If the source data is never referenced in a
|
236
|
+
transform, it is never extracted.
|
237
|
+
|
238
|
+
|
239
|
+
## Data Subjects
|
240
|
+
|
241
|
+
A _data subject_ refers to either a data source or a data target.
|
242
|
+
Either way, a data subject is associated with a data frame. Currently
|
243
|
+
the only data frames supported are
|
244
|
+
[Daru data frames](https://github.com/v0dro/daru), but support for
|
245
|
+
other data frames may be developed in the future. The data frame associated
|
246
|
+
with a data subject is accessed with the `#df` method and assigned with the `#df=`
|
247
|
+
method.
|
248
|
+
````ruby
|
249
|
+
my_data_subject.df #=> Daru::DataFrame
|
250
|
+
my_data_subject.df = Daru::DataFrame.new(...)
|
251
|
+
````
|
252
|
+
|
253
|
+
Additionally, all data subjects can be associated with a set of fields and field
|
254
|
+
metadata. Associating a data subject with field data allows us to develop
|
255
|
+
generic ETL routines that are triggered by arbitrary metadata that may be associated
|
256
|
+
with a field.
|
257
|
+
|
258
|
+
### Sources
|
259
|
+
|
260
|
+
### Targets
|
261
|
+
|
262
|
+
### Field Metadata
|
263
|
+
|
264
|
+
|
265
|
+
|
266
|
+
## Available Data Subjects
|
267
|
+
|
268
|
+
* CSV Files
|
269
|
+
* DataFrames
|
270
|
+
* None
|
271
|
+
* Local files
|
272
|
+
* SFTP Files
|
273
|
+
* S3 Files
|
274
|
+
* Salesforce
|
275
|
+
* Postgres
|
276
|
+
|
277
|
+
## Transforms
|
278
|
+
|
279
|
+
## Sub Jobs
|
280
|
+
|
281
|
+
## Job Parameters
|
282
|
+
|
283
|
+
## Sub Transforms
|
284
|
+
|
43
285
|
## Transforming Data
|
44
286
|
|
287
|
+
When `#execute` is called on an instance of a `Remi::Job`, all transforms are executed in
|
288
|
+
the order defined in the class.
|
289
|
+
|
45
290
|
TODO:
|
46
291
|
|
47
292
|
Describe Daru foundation
|
@@ -61,6 +61,7 @@ end
|
|
61
61
|
|
62
62
|
Then /^the file that comes last in an alphanumeric sort by group will be downloaded for processing$/ do
|
63
63
|
expect(@brt.source.data_subject.extractors.map(&:most_recent_by)).to include :name
|
64
|
+
expect(@brt.source.data_subject.extractors.map(&:most_recent_only)).not_to include false
|
64
65
|
end
|
65
66
|
|
66
67
|
Then /^the file is uploaded to the remote path "([^"]+)"$/ do |remote_path|
|
@@ -626,6 +627,21 @@ Then /^only the following fields should be present on the target:$/ do |table|
|
|
626
627
|
expect(@brt.target.data_subject.df.vectors.to_a).to match_array @brt.target.fields.field_names
|
627
628
|
end
|
628
629
|
|
630
|
+
Then /^only the following fields should be present on the targets:$/ do |table|
|
631
|
+
table.rows.each do |row|
|
632
|
+
field = row[0]
|
633
|
+
targets = row[1].split(',')
|
634
|
+
targets.each { |target| step "the target field '#{target}: #{field}'" }
|
635
|
+
end
|
636
|
+
|
637
|
+
@brt.run_transforms
|
638
|
+
@brt.targets.keys.each do |target|
|
639
|
+
expect(@brt.targets[target].data_subject.df.vectors.to_a).to match_array @brt.targets[target].fields.field_names
|
640
|
+
end
|
641
|
+
end
|
642
|
+
|
643
|
+
|
644
|
+
|
629
645
|
### Record-level expectations
|
630
646
|
|
631
647
|
Then /^the record from source '(.+)' should be (?i)(Retained|Rejected)(?-i)(?: without error|)$/ do |source_name, action|
|
data/jobs/sub_job_example_job.rb
CHANGED
@@ -73,14 +73,14 @@ class SubJobExampleJob < Remi::Job
|
|
73
73
|
target :zombified_beers
|
74
74
|
|
75
75
|
transform :zombification do
|
76
|
-
# Sub jobs
|
77
|
-
beers_job
|
76
|
+
# Sub jobs are executed when data from a sub job is requested
|
77
|
+
# Here, the sub job beers_job is executed
|
78
78
|
just_beers.df = beer_fridge.df
|
79
79
|
|
80
|
-
#
|
80
|
+
# Data is supplied to the sub job on assignment
|
81
81
|
beers_to_zombify.df = just_beers.df
|
82
|
-
|
83
|
-
zombify_job
|
82
|
+
|
83
|
+
# Here, the sub job zombify_job is executed using the data supplied to it above
|
84
84
|
zombified_beers.df = zombie_fridge.df
|
85
85
|
end
|
86
86
|
end
|