remi 0.3.0 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 71164673ece850e218a1ef18a72aa02f4ca5d065
4
- data.tar.gz: b52e6e86bc4995f2caf79dad79a7f0a195cdb9f9
3
+ metadata.gz: ea2d2971479e9e0dfcc5de4dd01ac13f5274a6f8
4
+ data.tar.gz: ea5e3a3280613d00ae29f5265c342740973ea57c
5
5
  SHA512:
6
- metadata.gz: 708b6fdf566acb525caa85bc038685bad46c3999f65045b5281c9393d731f5a3cdb49ac30a7ecde12f16abab5a27364449a83fc840adfa531e2a04a783fcd69a
7
- data.tar.gz: 48730dd300bb84dadb93d911e37baa274958d0d1deaddcbc769aa43a7f7c1887ffbcf025ce43d4188e09630de31868acc6315ae2559dbd80853febc42e13b5d7
6
+ metadata.gz: a09f8f926a99891356bcd6363ef4729b3a27bcf9cbfacce2a6bc8a8b60a262cfe83fe40d1e0f1f0c79945693aacb0fb1099b5bad299e022511846730e98642fe
7
+ data.tar.gz: d160e9840162558b1d9e203825f8e724ce1626e9f8a6922b86f894249b78c5c6680b37fef7e24574d794aac41db025bd887c707df35f6485a3540914552a3293
data/.bundle/config CHANGED
@@ -1,2 +1,2 @@
1
1
  ---
2
- BUNDLE_DISABLE_SHARED_GEMS: '1'
2
+ BUNDLE_DISABLE_SHARED_GEMS: "true"
data/Gemfile CHANGED
@@ -2,6 +2,7 @@
2
2
  source 'https://rubygems.org'
3
3
 
4
4
  gemspec
5
+ gem 'google-api-client', '~> 0.9'
5
6
  gem 'daru', '0.1.4.1', git: 'git@github.com:inside-track/daru.git', branch: '0.1.4.1-Remi'
6
7
  gem 'restforce', '~> 2.1'
7
8
  gem 'salesforce_bulk_api', git: 'git@github.com:inside-track/salesforce_bulk_api.git', branch: 'master'
data/Gemfile.lock CHANGED
@@ -18,7 +18,7 @@ GIT
18
18
  PATH
19
19
  remote: .
20
20
  specs:
21
- remi (0.3.0)
21
+ remi (0.3.1)
22
22
  activesupport (~> 4.2)
23
23
  bond (~> 0.5)
24
24
  cucumber (~> 2.1)
@@ -39,6 +39,7 @@ GEM
39
39
  minitest (~> 5.1)
40
40
  thread_safe (~> 0.3, >= 0.3.4)
41
41
  tzinfo (~> 1.1)
42
+ addressable (2.4.0)
42
43
  aws-sdk (2.3.5)
43
44
  aws-sdk-resources (= 2.3.5)
44
45
  aws-sdk-core (2.3.5)
@@ -69,7 +70,26 @@ GEM
69
70
  faraday (>= 0.7.4, < 0.10)
70
71
  gherkin (3.2.0)
71
72
  github-markup (1.4.0)
73
+ google-api-client (0.9.15)
74
+ addressable (~> 2.3)
75
+ googleauth (~> 0.5)
76
+ httpclient (~> 2.7)
77
+ hurley (~> 0.1)
78
+ memoist (~> 0.11)
79
+ mime-types (>= 1.6)
80
+ representable (~> 2.3.0)
81
+ retriable (~> 2.0)
82
+ googleauth (0.5.1)
83
+ faraday (~> 0.9)
84
+ jwt (~> 1.4)
85
+ logging (~> 2.0)
86
+ memoist (~> 0.12)
87
+ multi_json (~> 1.11)
88
+ os (~> 0.9)
89
+ signet (~> 0.7)
72
90
  hashie (3.4.3)
91
+ httpclient (2.8.2.4)
92
+ hurley (0.2)
73
93
  i18n (0.7.0)
74
94
  iruby (0.2.7)
75
95
  bond (~> 0.5)
@@ -80,6 +100,15 @@ GEM
80
100
  json_pure (>= 1.8.1)
81
101
  json (1.8.3)
82
102
  json_pure (1.8.3)
103
+ jwt (1.5.6)
104
+ little-plugger (1.1.4)
105
+ logging (2.1.0)
106
+ little-plugger (~> 1.1)
107
+ multi_json (~> 1.10)
108
+ memoist (0.15.0)
109
+ mime-types (3.1)
110
+ mime-types-data (~> 3.2015)
111
+ mime-types-data (3.2016.0521)
83
112
  mimemagic (0.3.1)
84
113
  minitest (5.8.4)
85
114
  multi_json (1.11.2)
@@ -88,16 +117,20 @@ GEM
88
117
  net-sftp (2.1.2)
89
118
  net-ssh (>= 2.6.5)
90
119
  net-ssh (3.1.1)
120
+ os (0.9.6)
91
121
  pg (0.18.4)
92
122
  rbczmq (1.7.9)
93
123
  redcarpet (3.3.4)
94
124
  regex_sieve (0.1.0)
95
125
  regexp-examples (1.2.0)
126
+ representable (2.3.0)
127
+ uber (~> 0.0.7)
96
128
  restforce (2.2.0)
97
129
  faraday (~> 0.9.0)
98
130
  faraday_middleware (>= 0.8.8)
99
131
  hashie (>= 1.2.0, < 4.0)
100
132
  json (>= 1.7.5, < 1.9.0)
133
+ retriable (2.1.0)
101
134
  rspec (3.4.0)
102
135
  rspec-core (~> 3.4.0)
103
136
  rspec-expectations (~> 3.4.0)
@@ -111,9 +144,15 @@ GEM
111
144
  diff-lcs (>= 1.2.0, < 2.0)
112
145
  rspec-support (~> 3.4.0)
113
146
  rspec-support (3.4.1)
147
+ signet (0.7.3)
148
+ addressable (~> 2.3)
149
+ faraday (~> 0.9)
150
+ jwt (~> 1.5)
151
+ multi_json (~> 1.10)
114
152
  thread_safe (0.3.5)
115
153
  tzinfo (1.2.2)
116
154
  thread_safe (~> 0.1)
155
+ uber (0.0.15)
117
156
  xml-simple (1.1.5)
118
157
  yard (0.9.0)
119
158
 
@@ -123,13 +162,14 @@ PLATFORMS
123
162
  DEPENDENCIES
124
163
  aws-sdk (~> 2.3)
125
164
  daru (= 0.1.4.1)!
126
- github-markup
165
+ github-markup (~> 1.4)
166
+ google-api-client (~> 0.9)
127
167
  iruby (= 0.2.7)
128
- redcarpet
168
+ redcarpet (~> 3.3)
129
169
  remi!
130
170
  restforce (~> 2.1)
131
171
  salesforce_bulk_api!
132
- yard
172
+ yard (~> 0.9)
133
173
 
134
174
  BUNDLED WITH
135
- 1.11.2
175
+ 1.13.5
data/README.md CHANGED
@@ -24,6 +24,11 @@ Remi will follow [semantic versioning](http://semver.org/) principles.
24
24
  Of course, while we're still on major version zero, little effort will
25
25
  be made to maintain backward compatibility.
26
26
 
27
+ The data transformation layer is built on top of
28
+ [Daru dataframe](https://github.com/v0dro/daru). Familiarity with
29
+ Daru dataframes is essential for writing complex transformations in
30
+ Remi.
31
+
27
32
  ## Getting Started
28
33
 
29
34
  Add the gem to your Gemfile, `bundle install`, and then initialize your repository as
@@ -40,8 +45,248 @@ sure this works by running
40
45
 
41
46
  All of the test should pass.
42
47
 
48
+ ## Remi Jobs
49
+
50
+ A Remi job describes the data sources that will be used to collect
51
+ data, the transformations that will be performed on the data, and the
52
+ data targets that will be populated when all transformations are
53
+ complete. With Remi, an ETL process is defined in a class that
54
+ inherits from the `Remi::Job` class.
55
+
56
+ ### Hello World
57
+
58
+ A very simple "Hello World" example of a Remi job would be
59
+
60
+ ````ruby
61
+ class HelloWorldJob < Remi::Job
62
+ transform :say_hi do
63
+ puts "Hello World"
64
+ end
65
+ end
66
+ ````
67
+
68
+ This job doesn't make use of any data subjects (data sources or data
69
+ targets), but it does define a single data transform called `:say_hi`.
70
+ The full job can be executed by calling the `#execute` method on an
71
+ instance of the `HelloWorldJob` class
72
+
73
+ ````ruby
74
+ job = HelloWorldJob.new
75
+ job.execute
76
+ #=> "Hello World"
77
+ ````
78
+
79
+ The transform called `say_hi` is just a method of the `HelloWorldJob`
80
+ class representing a job transform object. Multiple transforms can be
81
+ defined in a Remi job. To execute a specific transform we can call that transform by
82
+ name using
83
+
84
+ ````ruby
85
+ job = HelloWorldJob.new
86
+ job.say_hi.execute
87
+ #=> "Hello World"
88
+ ````
89
+
90
+ ### A more complete example
91
+
92
+ Suppose we have a database containing data on beer sales. It's a
93
+ normalized database where we store data on individual beers sold in a
94
+ `beer_sales_fact` table and information on the details of the beer in
95
+ a `beers_dim` table. We'd like to extract data from both of these
96
+ sources, combine them into a single flattened table and save it as a
97
+ CSV file. This operation could be performed with the following Remi
98
+ job. (Of course, if this were a real world problem, we'd do the join
99
+ in the database before extracting; this is a contrived example to show
100
+ how one can combine data from multiple arbitrary sources).
101
+
102
+
103
+ ````ruby
104
+ class DenormalizeBeersJob < Remi::Job
105
+ source :beer_sales_fact do
106
+ extractor Remi::Extractor::Postgres.new(
107
+ credentials: {
108
+ dbname: 'my_local_db'
109
+ },
110
+ query: 'SELECT beer_id, sold_date, quantity FROM beer_sales_fact'
111
+ )
112
+ parser Remi::Parser::Postgres.new
113
+
114
+ fields(
115
+ {
116
+ :beer_id => {},
117
+ :sold_date => { type: :date, in_format: '%Y-%m-%d' },
118
+ :quantity => { type: :integer }
119
+ }
120
+ )
121
+ end
122
+
123
+ source :beers_dim do
124
+ extractor Remi::Extractor::Postgres.new(
125
+ credentials: {
126
+ dbname: 'my_local_db'
127
+ },
128
+ query: 'SELECT beer_id, name, price_per_unit FROM beers_dim'
129
+ )
130
+ parser Remi::Parser::Postgres.new
131
+
132
+ fields(
133
+ {
134
+ :beer_id => {},
135
+ :name => {},
136
+ :price_per_unit => { type: :decimal, scale: 2 }
137
+ }
138
+ )
139
+ end
140
+
141
+ target :flat_beer_file do
142
+ encoder Remi::Encoder::CsvFile.new
143
+ loader Remi::Loader::LocalFile.new(
144
+ path: 'flat_beers.csv'
145
+ )
146
+ end
147
+
148
+ transform :type_enforcement do
149
+ beer_sales_fact.enforce_types
150
+ beers_dim.enforce_types
151
+ end
152
+
153
+ transform :flatten do
154
+ flat_beer_file.df = beer_sales_fact.df.join(beers_dim.df, on: [:beer_id], how: :inner)
155
+
156
+ Remi::SourceToTargetMap.apply(flat_beer_file.df) do
157
+ map source(:quantity, :price_per_unit) .target(:total_price)
158
+ .transform(->(row) {
159
+ row[:quantity] * row[:price_per_unit]
160
+ })
161
+ end
162
+ end
163
+ end
164
+ ````
165
+
166
+ ### Components of a Remi Job
167
+
168
+ A Remi job is composed of one or more of the following elements, which are described
169
+ in more detail below. All of these elements are defined using class methods (part
170
+ of `Remi::Job`). Each of the elements is given a name and defined in a block.
171
+
172
+ * Data Subjects - A data subject is either a data source or a data target.
173
+ * Data Sources - A data source describes where data is extracted from.
174
+ ````ruby
175
+ source :my_source do
176
+ # ... source definition
177
+ end
178
+ ````
179
+ * Data Targets - A data target describes where data is loaded to.
180
+ ````ruby
181
+ target :my_target do
182
+ # ... target definition
183
+ end
184
+ ````
185
+
186
+ * Transforms - A transform is essentially an arbitrary block of Ruby
187
+ code, but is typically used to transform data sources into data targets.
188
+ ````ruby
189
+ transform :my_transform do
190
+ # ... lots of code
191
+ end
192
+ ````
193
+
194
+ * Job Parameters - A job parameter is a memoized block of code
195
+ (similar to RSpec's `let` method) that is used to configure a job and may
196
+ be overridden at runtime if needed.
197
+ ````ruby
198
+ param :my_param do
199
+ # ... the return value of this block is memoized
200
+ end
201
+ ````
202
+
203
+ * Sub Transforms - Sub transforms are essentially transforms, but they are NOT
204
+ automatically executed when the job is executed. Instead, they must be _imported_
205
+ in a transform. They are meant to be reusable bits of transform code.
206
+ ````ruby
207
+ sub_transform :my_sub_transform do
208
+ #... sub_transform stuff
209
+ end
210
+ ````
211
+
212
+ * Sub Jobs - Sub jobs are simply references to other Remi jobs that may be executed
213
+ within the current job.
214
+ ````ruby
215
+ sub_job(:my_sub_job) { MySubJob.new }
216
+ ````
217
+
218
+
219
+
220
+ ### Execution Plan
221
+
222
+ The `DenormalizeBeersJob` example above can be executed using
223
+
224
+ ````ruby
225
+ job = DenormalizeBeersJob.new
226
+ job.execute
227
+ ````
228
+
229
+ Calling `#execute` on an instance of a job does the following, in this order:
230
+ 1. All transforms defined in the job (via `transform :name do ... end`) are executed
231
+ in the order they were defined in the class definition.
232
+ 2. All data targets are loaded in the order they are defined in the job.
233
+
234
+ Note that data sources are not extracted until the moment the data is
235
+ needed in a transform. If the source data is never referenced in a
236
+ transform, it is never extracted.
237
+
238
+
239
+ ## Data Subjects
240
+
241
+ A _data subject_ refers to either a data source or a data target.
242
+ Either way, a data subject is associated with a data frame. Currently
243
+ the only data frames supported are
244
+ [Daru data frames](https://github.com/v0dro/daru), but support for
245
+ other data frames may be developed in the future. The data frame associated
246
+ with a data subject is accessed with the `#df` method and assigned with the `#df=`
247
+ method.
248
+ ````ruby
249
+ my_data_subject.df #=> Daru::DataFrame
250
+ my_data_subject.df = Daru::DataFrame.new(...)
251
+ ````
252
+
253
+ Additionally, all data subjects can be associated with a set of fields and field
254
+ metadata. Associating a data subject with field data allows us to develop
255
+ generic ETL routines that are triggered by arbitrary metadata that may be associated
256
+ with a field.
257
+
258
+ ### Sources
259
+
260
+ ### Targets
261
+
262
+ ### Field Metadata
263
+
264
+
265
+
266
+ ## Available Data Subjects
267
+
268
+ * CSV Files
269
+ * DataFrames
270
+ * None
271
+ * Local files
272
+ * SFTP Files
273
+ * S3 Files
274
+ * Salesforce
275
+ * Postgres
276
+
277
+ ## Transforms
278
+
279
+ ## Sub Jobs
280
+
281
+ ## Job Parameters
282
+
283
+ ## Sub Transforms
284
+
43
285
  ## Transforming Data
44
286
 
287
+ When `#execute` is called on an instance of a `Remi::Job`, all transforms are executed in
288
+ the order defined in the class definition.
289
+
45
290
  TODO:
46
291
 
47
292
  Describe Daru foundation
@@ -61,6 +61,7 @@ end
61
61
 
62
62
  Then /^the file that comes last in an alphanumeric sort by group will be downloaded for processing$/ do
63
63
  expect(@brt.source.data_subject.extractors.map(&:most_recent_by)).to include :name
64
+ expect(@brt.source.data_subject.extractors.map(&:most_recent_only)).not_to include false
64
65
  end
65
66
 
66
67
  Then /^the file is uploaded to the remote path "([^"]+)"$/ do |remote_path|
@@ -626,6 +627,21 @@ Then /^only the following fields should be present on the target:$/ do |table|
626
627
  expect(@brt.target.data_subject.df.vectors.to_a).to match_array @brt.target.fields.field_names
627
628
  end
628
629
 
630
+ Then /^only the following fields should be present on the targets:$/ do |table|
631
+ table.rows.each do |row|
632
+ field = row[0]
633
+ targets = row[1].split(',')
634
+ targets.each { |target| step "the target field '#{target}: #{field}'" }
635
+ end
636
+
637
+ @brt.run_transforms
638
+ @brt.targets.keys.each do |target|
639
+ expect(@brt.targets[target].data_subject.df.vectors.to_a).to match_array @brt.targets[target].fields.field_names
640
+ end
641
+ end
642
+
643
+
644
+
629
645
  ### Record-level expectations
630
646
 
631
647
  Then /^the record from source '(.+)' should be (?i)(Retained|Rejected)(?-i)(?: without error|)$/ do |source_name, action|
@@ -73,14 +73,14 @@ class SubJobExampleJob < Remi::Job
73
73
  target :zombified_beers
74
74
 
75
75
  transform :zombification do
76
- # Sub jobs must be executed before their sources are available
77
- beers_job.execute
76
+ # Sub jobs are executed when data from a sub job is requested
77
+ # Here, the sub job beers_job is executed
78
78
  just_beers.df = beer_fridge.df
79
79
 
80
- # Sub job targets must be loaded before they are available to subjobs
80
+ # Data is supplied to the sub job on assignment
81
81
  beers_to_zombify.df = just_beers.df
82
- beers_to_zombify.load
83
- zombify_job.execute
82
+
83
+ # Here, the sub job zombify_job is executed using the data supplied to it above
84
84
  zombified_beers.df = zombie_fridge.df
85
85
  end
86
86
  end