remi 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 71164673ece850e218a1ef18a72aa02f4ca5d065
4
- data.tar.gz: b52e6e86bc4995f2caf79dad79a7f0a195cdb9f9
3
+ metadata.gz: ea2d2971479e9e0dfcc5de4dd01ac13f5274a6f8
4
+ data.tar.gz: ea5e3a3280613d00ae29f5265c342740973ea57c
5
5
  SHA512:
6
- metadata.gz: 708b6fdf566acb525caa85bc038685bad46c3999f65045b5281c9393d731f5a3cdb49ac30a7ecde12f16abab5a27364449a83fc840adfa531e2a04a783fcd69a
7
- data.tar.gz: 48730dd300bb84dadb93d911e37baa274958d0d1deaddcbc769aa43a7f7c1887ffbcf025ce43d4188e09630de31868acc6315ae2559dbd80853febc42e13b5d7
6
+ metadata.gz: a09f8f926a99891356bcd6363ef4729b3a27bcf9cbfacce2a6bc8a8b60a262cfe83fe40d1e0f1f0c79945693aacb0fb1099b5bad299e022511846730e98642fe
7
+ data.tar.gz: d160e9840162558b1d9e203825f8e724ce1626e9f8a6922b86f894249b78c5c6680b37fef7e24574d794aac41db025bd887c707df35f6485a3540914552a3293
data/.bundle/config CHANGED
@@ -1,2 +1,2 @@
1
1
  ---
2
- BUNDLE_DISABLE_SHARED_GEMS: '1'
2
+ BUNDLE_DISABLE_SHARED_GEMS: "true"
data/Gemfile CHANGED
@@ -2,6 +2,7 @@
2
2
  source 'https://rubygems.org'
3
3
 
4
4
  gemspec
5
+ gem 'google-api-client', '~> 0.9'
5
6
  gem 'daru', '0.1.4.1', git: 'git@github.com:inside-track/daru.git', branch: '0.1.4.1-Remi'
6
7
  gem 'restforce', '~> 2.1'
7
8
  gem 'salesforce_bulk_api', git: 'git@github.com:inside-track/salesforce_bulk_api.git', branch: 'master'
data/Gemfile.lock CHANGED
@@ -18,7 +18,7 @@ GIT
18
18
  PATH
19
19
  remote: .
20
20
  specs:
21
- remi (0.3.0)
21
+ remi (0.3.1)
22
22
  activesupport (~> 4.2)
23
23
  bond (~> 0.5)
24
24
  cucumber (~> 2.1)
@@ -39,6 +39,7 @@ GEM
39
39
  minitest (~> 5.1)
40
40
  thread_safe (~> 0.3, >= 0.3.4)
41
41
  tzinfo (~> 1.1)
42
+ addressable (2.4.0)
42
43
  aws-sdk (2.3.5)
43
44
  aws-sdk-resources (= 2.3.5)
44
45
  aws-sdk-core (2.3.5)
@@ -69,7 +70,26 @@ GEM
69
70
  faraday (>= 0.7.4, < 0.10)
70
71
  gherkin (3.2.0)
71
72
  github-markup (1.4.0)
73
+ google-api-client (0.9.15)
74
+ addressable (~> 2.3)
75
+ googleauth (~> 0.5)
76
+ httpclient (~> 2.7)
77
+ hurley (~> 0.1)
78
+ memoist (~> 0.11)
79
+ mime-types (>= 1.6)
80
+ representable (~> 2.3.0)
81
+ retriable (~> 2.0)
82
+ googleauth (0.5.1)
83
+ faraday (~> 0.9)
84
+ jwt (~> 1.4)
85
+ logging (~> 2.0)
86
+ memoist (~> 0.12)
87
+ multi_json (~> 1.11)
88
+ os (~> 0.9)
89
+ signet (~> 0.7)
72
90
  hashie (3.4.3)
91
+ httpclient (2.8.2.4)
92
+ hurley (0.2)
73
93
  i18n (0.7.0)
74
94
  iruby (0.2.7)
75
95
  bond (~> 0.5)
@@ -80,6 +100,15 @@ GEM
80
100
  json_pure (>= 1.8.1)
81
101
  json (1.8.3)
82
102
  json_pure (1.8.3)
103
+ jwt (1.5.6)
104
+ little-plugger (1.1.4)
105
+ logging (2.1.0)
106
+ little-plugger (~> 1.1)
107
+ multi_json (~> 1.10)
108
+ memoist (0.15.0)
109
+ mime-types (3.1)
110
+ mime-types-data (~> 3.2015)
111
+ mime-types-data (3.2016.0521)
83
112
  mimemagic (0.3.1)
84
113
  minitest (5.8.4)
85
114
  multi_json (1.11.2)
@@ -88,16 +117,20 @@ GEM
88
117
  net-sftp (2.1.2)
89
118
  net-ssh (>= 2.6.5)
90
119
  net-ssh (3.1.1)
120
+ os (0.9.6)
91
121
  pg (0.18.4)
92
122
  rbczmq (1.7.9)
93
123
  redcarpet (3.3.4)
94
124
  regex_sieve (0.1.0)
95
125
  regexp-examples (1.2.0)
126
+ representable (2.3.0)
127
+ uber (~> 0.0.7)
96
128
  restforce (2.2.0)
97
129
  faraday (~> 0.9.0)
98
130
  faraday_middleware (>= 0.8.8)
99
131
  hashie (>= 1.2.0, < 4.0)
100
132
  json (>= 1.7.5, < 1.9.0)
133
+ retriable (2.1.0)
101
134
  rspec (3.4.0)
102
135
  rspec-core (~> 3.4.0)
103
136
  rspec-expectations (~> 3.4.0)
@@ -111,9 +144,15 @@ GEM
111
144
  diff-lcs (>= 1.2.0, < 2.0)
112
145
  rspec-support (~> 3.4.0)
113
146
  rspec-support (3.4.1)
147
+ signet (0.7.3)
148
+ addressable (~> 2.3)
149
+ faraday (~> 0.9)
150
+ jwt (~> 1.5)
151
+ multi_json (~> 1.10)
114
152
  thread_safe (0.3.5)
115
153
  tzinfo (1.2.2)
116
154
  thread_safe (~> 0.1)
155
+ uber (0.0.15)
117
156
  xml-simple (1.1.5)
118
157
  yard (0.9.0)
119
158
 
@@ -123,13 +162,14 @@ PLATFORMS
123
162
  DEPENDENCIES
124
163
  aws-sdk (~> 2.3)
125
164
  daru (= 0.1.4.1)!
126
- github-markup
165
+ github-markup (~> 1.4)
166
+ google-api-client (~> 0.9)
127
167
  iruby (= 0.2.7)
128
- redcarpet
168
+ redcarpet (~> 3.3)
129
169
  remi!
130
170
  restforce (~> 2.1)
131
171
  salesforce_bulk_api!
132
- yard
172
+ yard (~> 0.9)
133
173
 
134
174
  BUNDLED WITH
135
- 1.11.2
175
+ 1.13.5
data/README.md CHANGED
@@ -24,6 +24,11 @@ Remi will follow [semantic versioning](http://semver.org/) principles.
24
24
  Of course, while we're still on major version zero, little effort will
25
25
  be made to maintain backward compatibility.
26
26
 
27
+ The data transformation layer is built on top of
28
+ [Daru dataframe](https://github.com/v0dro/daru). Familiarity with
29
+ Daru dataframes is essential for writing complex transformations in
30
+ Remi.
31
+
27
32
  ## Getting Started
28
33
 
29
34
  Add the gem to your Gemfile, `bundle install`, and then initialize your repository as
@@ -40,8 +45,248 @@ sure this works by running
40
45
 
41
46
  All of the tests should pass.
42
47
 
48
+ ## Remi Jobs
49
+
50
+ A Remi job describes the data sources that will be used to collect
51
+ data, the transformations that will be performed on the data, and the
52
+ data targets that will be populated when all transformations are
53
+ complete. With Remi, an ETL process is defined in a class that
54
+ inherits from the `Remi::Job` class.
55
+
56
+ ### Hello World
57
+
58
+ A very simple "Hello World" example of a Remi job would be
59
+
60
+ ````ruby
61
+ class HelloWorldJob < Remi::Job
62
+ transform :say_hi do
63
+ puts "Hello World"
64
+ end
65
+ end
66
+ ````
67
+
68
+ This job doesn't make use of any data subjects (data sources or data
69
+ targets), but it does define a single data transform called `:say_hi`.
70
+ The full job can be executed by calling the `#execute` method on an
71
+ instance of the `HelloWorldJob` class
72
+
73
+ ````ruby
74
+ job = HelloWorldJob.new
75
+ job.execute
76
+ #=> "Hello World"
77
+ ````
78
+
79
+ The transform called `say_hi` is just a method of the `HelloWorldJob`
80
+ class representing a job transform object. Multiple transforms can be
81
+ defined in a Remi job. To execute a specific transform we can call that transform by
82
+ name using
83
+
84
+ ````ruby
85
+ job = HelloWorldJob.new
86
+ job.say_hi.execute
87
+ #=> "Hello World"
88
+ ````
89
+
90
+ ### A more complete example
91
+
92
+ Suppose we have a database containing data on beer sales. It's a
93
+ normalized database where we store data on individual beers sold in a
94
+ `beer_sales_fact` table and information on the details of the beer in
95
+ a `beers_dim` table. We'd like to extract data from both of these
96
+ sources, combine them into a single flattened table and save it as a
97
+ CSV file. This operation could be performed with the following Remi
98
+ job. (Of course, if this were a real world problem, we'd do the join
99
+ in the database before extracting; this is a contrived example to show
100
+ how one can combine data from multiple arbitrary sources).
101
+
102
+
103
+ ````ruby
104
+ class DenormalizeBeersJob < Remi::Job
105
+ source :beer_sales_fact do
106
+ extractor Remi::Extractor::Postgres.new(
107
+ credentials: {
108
+ dbname: 'my_local_db'
109
+ },
110
+ query: 'SELECT beer_id, sold_date, quantity FROM beer_sales_fact'
111
+ )
112
+ parser Remi::Parser::Postgres.new
113
+
114
+ fields(
115
+ {
116
+ :beer_id => {},
117
+ :sold_date => { type: :date, in_format: '%Y-%m-%d' },
118
+ :quantity => { type: :integer }
119
+ }
120
+ )
121
+ end
122
+
123
+ source :beers_dim do
124
+ extractor Remi::Extractor::Postgres.new(
125
+ credentials: {
126
+ dbname: 'my_local_db'
127
+ },
128
+ query: 'SELECT beer_id, name, price_per_unit FROM beers_dim'
129
+ )
130
+ parser Remi::Parser::Postgres.new
131
+
132
+ fields(
133
+ {
134
+ :beer_id => {},
135
+ :name => {},
136
+ :price_per_unit => { type: :decimal, scale: 2 }
137
+ }
138
+ )
139
+ end
140
+
141
+ target :flat_beer_file do
142
+ encoder Remi::Encoder::CsvFile.new
143
+ loader Remi::Loader::LocalFile.new(
144
+ path: 'flat_beers.csv'
145
+ )
146
+ end
147
+
148
+ transform :type_enforcement do
149
+ beer_sales_fact.enforce_types
150
+ beers_dim.enforce_types
151
+ end
152
+
153
+ transform :flatten do
154
+ flat_beer_file.df = beer_sales_fact.df.join(beers_dim.df, on: [:beer_id], how: :inner)
155
+
156
+ Remi::SourceToTargetMap.apply(flat_beer_file.df) do
157
+ map source(:quantity, :price_per_unit) .target(:total_price)
158
+ .transform(->(row) {
159
+ row[:quantity] * row[:price_per_unit]
160
+ })
161
+ end
162
+ end
163
+ end
164
+ ````
165
+
166
+ ### Components of a Remi Job
167
+
168
+ A Remi job is composed of one or more of the following elements, which are described
169
+ in more detail below. All of these elements are defined using class methods (part
170
+ of `Remi::Job`). Each of the elements is given a name and defined in a block.
171
+
172
+ * Data Subjects - A data subject is either a data source or a data target.
173
+ * Data Sources - A data source describes where data is extracted from.
174
+ ````ruby
175
+ source :my_source do
176
+ # ... source definition
177
+ end
178
+ ````
179
+ * Data Targets - A data target describes where data is loaded to.
180
+ ````ruby
181
+ target :my_target do
182
+ # ... target definition
183
+ end
184
+ ````
185
+
186
+ * Transforms - A transform is essentially an arbitrary block of Ruby
187
+ code, but is typically used to transform data sources into data targets.
188
+ ````ruby
189
+ transform :my_transform do
190
+ # ... lots of code
191
+ end
192
+ ````
193
+
194
+ * Job Parameters - A job parameter is a memoized block of code
195
+ (similar to RSpec's `let` method) that is used to configure a job and may
196
+ be overridden at runtime if needed.
197
+ ````ruby
198
+ param :my_param do
199
+ # ... the return value of this block is memoized
200
+ end
201
+ ````
202
+
203
+ * Sub Transforms - Sub transforms are essentially transforms, but they are NOT
204
+ automatically executed when the job is executed. Instead, they must be _imported_
205
+ in a transform. They are meant to be reusable bits of transform code.
206
+ ````ruby
207
+ sub_transform :my_sub_transform do
208
+ #... sub_transform stuff
209
+ end
210
+ ````
211
+
212
+ * Sub Jobs - Sub jobs are simply references to other Remi jobs that may be executed
213
+ within the current job.
214
+ ````ruby
215
+ sub_job :my_sub_job { MySubJob.new }
216
+ ````
217
+
218
+
219
+
220
+ ### Execution Plan
221
+
222
+ The `DenormalizeBeersJob` example above can be executed using
223
+
224
+ ````ruby
225
+ job = DenormalizeBeersJob.new
226
+ job.execute
227
+ ````
228
+
229
+ Calling `#execute` on an instance of a job does the following, in this order:
230
+ 1. All transforms defined in the job (via `transform :name do ... end`) are executed
231
+ in the order they were defined in the class definition.
232
+ 2. All data targets are loaded in the order they are defined in the job.
233
+
234
+ Note that data sources are not extracted until the moment the data is
235
+ needed in a transform. If the source data is never referenced in a
236
+ transform, it is never extracted.
237
+
238
+
239
+ ## Data Subjects
240
+
241
+ A _data subject_ refers to either a data source or a data target.
242
+ Either way, a data subject is associated with a data frame. Currently
243
+ the only data frames supported are
244
+ [Daru data frames](https://github.com/v0dro/daru), but support for
245
+ other data frames may be developed in the future. The data frame associated
246
+ with a data subject is accessed with the `#df` method and assigned with the `#df=`
247
+ method.
248
+ ````ruby
249
+ my_data_subject.df #=> Daru::DataFrame
250
+ my_data_subject.df = Daru::DataFrame.new(...)
251
+ ````
252
+
253
+ Additionally, all data subjects can be associated with a set of fields and field
254
+ metadata. Associating a data subject with field data allows us to develop
255
+ generic ETL routines that are triggered by arbitrary metadata that may be associated
256
+ with a field.
257
+
258
+ ### Sources
259
+
260
+ ### Targets
261
+
262
+ ### Field Metadata
263
+
264
+
265
+
266
+ ## Available Data Subjects
267
+
268
+ * CSV Files
269
+ * DataFrames
270
+ * None
271
+ * Local files
272
+ * SFTP Files
273
+ * S3 Files
274
+ * Salesforce
275
+ * Postgres
276
+
277
+ ## Transforms
278
+
279
+ ## Sub Jobs
280
+
281
+ ## Job Parameters
282
+
283
+ ## Sub Transforms
284
+
43
285
  ## Transforming Data
44
286
 
287
+ When `#execute` is called on an instance of a `Remi::Job`, all transforms are executed in
288
+ the order in which they are defined in the class.
289
+
45
290
  TODO:
46
291
 
47
292
  Describe Daru foundation
@@ -61,6 +61,7 @@ end
61
61
 
62
62
  Then /^the file that comes last in an alphanumeric sort by group will be downloaded for processing$/ do
63
63
  expect(@brt.source.data_subject.extractors.map(&:most_recent_by)).to include :name
64
+ expect(@brt.source.data_subject.extractors.map(&:most_recent_only)).not_to include false
64
65
  end
65
66
 
66
67
  Then /^the file is uploaded to the remote path "([^"]+)"$/ do |remote_path|
@@ -626,6 +627,21 @@ Then /^only the following fields should be present on the target:$/ do |table|
626
627
  expect(@brt.target.data_subject.df.vectors.to_a).to match_array @brt.target.fields.field_names
627
628
  end
628
629
 
630
+ Then /^only the following fields should be present on the targets:$/ do |table|
631
+ table.rows.each do |row|
632
+ field = row[0]
633
+ targets = row[1].split(',')
634
+ targets.each { |target| step "the target field '#{target}: #{field}'" }
635
+ end
636
+
637
+ @brt.run_transforms
638
+ @brt.targets.keys.each do |target|
639
+ expect(@brt.targets[target].data_subject.df.vectors.to_a).to match_array @brt.targets[target].fields.field_names
640
+ end
641
+ end
642
+
643
+
644
+
629
645
  ### Record-level expectations
630
646
 
631
647
  Then /^the record from source '(.+)' should be (?i)(Retained|Rejected)(?-i)(?: without error|)$/ do |source_name, action|
@@ -73,14 +73,14 @@ class SubJobExampleJob < Remi::Job
73
73
  target :zombified_beers
74
74
 
75
75
  transform :zombification do
76
- # Sub jobs must be executed before their sources are available
77
- beers_job.execute
76
+ # Sub jobs are executed when data from a sub job is requested
77
+ # Here, the sub job beers_job is executed
78
78
  just_beers.df = beer_fridge.df
79
79
 
80
- # Sub job targets must be loaded before they are available to subjobs
80
+ # Data is supplied to the sub job on assignment
81
81
  beers_to_zombify.df = just_beers.df
82
- beers_to_zombify.load
83
- zombify_job.execute
82
+
83
+ # Here, the sub job zombify_job is executed using the data supplied to it above
84
84
  zombified_beers.df = zombie_fridge.df
85
85
  end
86
86
  end