remi 0.0.1 → 0.2.2

Files changed (60)
  1. checksums.yaml +4 -4
  2. data/.bundle/config +2 -0
  3. data/.gitignore +3 -2
  4. data/.rspec +2 -0
  5. data/.ruby-version +1 -0
  6. data/Gemfile +4 -0
  7. data/Gemfile.lock +123 -0
  8. data/LICENSE.txt +21 -0
  9. data/README.md +94 -3
  10. data/bin/remi +8 -0
  11. data/doc/install-rbenv-os_x.md +47 -0
  12. data/lib/remi.rb +56 -9
  13. data/lib/remi/cli.rb +56 -0
  14. data/lib/remi/core/daru.rb +28 -0
  15. data/lib/remi/core/refinements.rb +21 -0
  16. data/lib/remi/core/string.rb +8 -0
  17. data/lib/remi/cucumber.rb +7 -0
  18. data/lib/remi/cucumber/business_rules.rb +504 -0
  19. data/lib/remi/cucumber/data_source.rb +63 -0
  20. data/lib/remi/data_source.rb +13 -0
  21. data/lib/remi/data_source/csv_file.rb +79 -0
  22. data/lib/remi/data_source/data_frame.rb +10 -0
  23. data/lib/remi/data_source/postgres.rb +58 -0
  24. data/lib/remi/data_source/salesforce.rb +78 -0
  25. data/lib/remi/data_subject.rb +25 -0
  26. data/lib/remi/data_target.rb +15 -0
  27. data/lib/remi/data_target/csv_file.rb +49 -0
  28. data/lib/remi/data_target/data_frame.rb +14 -0
  29. data/lib/remi/data_target/salesforce.rb +49 -0
  30. data/lib/remi/extractor/sftp_file.rb +84 -0
  31. data/lib/remi/field_symbolizers.rb +17 -0
  32. data/lib/remi/job.rb +200 -0
  33. data/lib/remi/lookup/regex_sieve.rb +55 -0
  34. data/lib/remi/project/features/examples.feature +24 -0
  35. data/lib/remi/project/features/formulas.feature +64 -0
  36. data/lib/remi/project/features/sample_job.feature +304 -0
  37. data/lib/remi/project/features/step_definitions/remi_step.rb +310 -0
  38. data/lib/remi/project/features/support/env.rb +10 -0
  39. data/lib/remi/project/features/support/env_app.rb +3 -0
  40. data/lib/remi/project/features/transforms/date_diff.feature +50 -0
  41. data/lib/remi/project/features/transforms/parse_date.feature +34 -0
  42. data/lib/remi/project/features/transforms/prefix.feature +15 -0
  43. data/lib/remi/project/jobs/all_jobs_shared.rb +25 -0
  44. data/lib/remi/project/jobs/copy_source_job.rb +12 -0
  45. data/lib/remi/project/jobs/sample_job.rb +164 -0
  46. data/lib/remi/project/jobs/transforms/date_diff_job.rb +17 -0
  47. data/lib/remi/project/jobs/transforms/parse_date_job.rb +18 -0
  48. data/lib/remi/project/jobs/transforms/prefix_job.rb +16 -0
  49. data/lib/remi/project/jobs/transforms/transform_jobs.rb +3 -0
  50. data/lib/remi/settings.rb +39 -0
  51. data/lib/remi/sf_bulk_helper.rb +265 -0
  52. data/lib/remi/source_to_target_map.rb +93 -0
  53. data/lib/remi/transform.rb +137 -0
  54. data/lib/remi/version.rb +3 -0
  55. data/remi.gemspec +25 -7
  56. data/workbooks/sample_workbook.ipynb +56 -0
  57. data/workbooks/workbook_helper.rb +1 -0
  58. metadata +234 -17
  59. data/lib/noodling.rb +0 -163
  60. data/test/test_NAME.rb +0 -19
data/lib/remi/core/daru.rb
@@ -0,0 +1,28 @@
+ module Daru
+   class DataFrame
+     def monkey_dup
+       dupdf = Daru::DataFrame.new([], index: self.index)
+       self.vectors.each do |v|
+         dupdf[v] = self[v]
+       end
+
+       dupdf
+     end
+
+     def monkey_merge(other)
+       other.vectors.each do |v|
+         self[v] = other[v]
+       end
+
+       self
+     end
+
+     def hash_dump(filename)
+       File.write(filename, Marshal.dump(self.to_hash))
+     end
+
+     def self.from_hash_dump(filename)
+       Daru::DataFrame.new(Marshal.load(File.read(filename)))
+     end
+   end
+ end
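
For orientation, a short usage sketch of the Daru monkey patches added above. This example is not part of the diff; the frame contents and the dump file name are made up, and the require path assumes the gem's lib/ layout shown in the file list.

require 'daru'
require 'remi/core/daru'   # loads the patch shown above

left  = Daru::DataFrame.new({ id: [1, 2], name: ['a', 'b'] })
right = Daru::DataFrame.new({ score: [10, 20] })

copy = left.monkey_dup          # rebuilds the frame vector-by-vector on the same index
copy.monkey_merge(right)        # copies right's vectors onto copy, in place
copy.hash_dump('copy.dump')     # Marshal the frame's hash form to disk
restored = Daru::DataFrame.from_hash_dump('copy.dump')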
data/lib/remi/core/refinements.rb
@@ -0,0 +1,21 @@
+ module Remi
+   module Core
+     module Refinements
+       refine String do
+         def symbolize(symbolizer=nil)
+           if symbolizer
+             symbolizer.call(self)
+           else
+             Remi::FieldSymbolizers[:standard].call(self)
+           end
+         end
+       end
+
+       refine Symbol do
+         def symbolize(symbolizer=nil)
+           self.to_s.symbolize
+         end
+       end
+     end
+   end
+ end
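
A quick sketch of the symbolize refinement in use. This is not from the diff; the field name and the custom symbolizer lambda are hypothetical.

require 'remi'

using Remi::Core::Refinements

'Student Id'.symbolize
  # symbolized with Remi::FieldSymbolizers[:standard] (the default)
'Student Id'.symbolize(->(name) { name.strip.downcase.gsub(/\s+/, '_').to_sym })
  # => :student_id, using an explicitly supplied symbolizer instead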
data/lib/remi/core/string.rb
@@ -0,0 +1,8 @@
+ class String
+   # Strip leading whitespace from each line that is the same as the
+   # amount of whitespace on the first line of the string.
+   # Leaves _additional_ indentation on later lines intact.
+   def unindent
+     gsub /^#{self[/\A\s*/]}/, ''
+   end
+ end
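
For reference, unindent removes only the first line's leading whitespace from every line, so relative indentation survives. The heredoc below is an illustrative example, not part of the diff.

query = <<-EOS.unindent
    SELECT id
      FROM students
EOS
# => "SELECT id\n  FROM students\n"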
data/lib/remi/cucumber.rb
@@ -0,0 +1,7 @@
+ require 'rspec/expectations'
+ require 'cucumber/rspec/doubles'
+
+ require 'regexp-examples'
+
+ require_relative 'cucumber/data_source'
+ require_relative 'cucumber/business_rules'
data/lib/remi/cucumber/business_rules.rb
@@ -0,0 +1,504 @@
+ module Remi::BusinessRules
+   using Remi::Core::Refinements
+
+   def self.parse_full_field(full_field_name)
+     full_field_name.split(':').map(&:strip)
+   end
+
+   def self.csv_opt_map
+     {
+       'tab' => "\t",
+       'comma' => ',',
+       'pipe' => '|',
+       'double quote' => '"',
+       'single quote' => "'",
+       'windows' => "\r\n",
+       'unix' => "\n",
+       'windows or unix' => :auto,
+     }
+   end
+
+
+   module ParseFormula
+     extend self
+
+     def is_formula?(arg)
+       !base_regex.match(arg).nil?
+     end
+
+     def base_regex
+       @base_regex ||= /\A\*(.*)\*\Z/
+     end
+
+     def formulas
+       @formulas ||= Remi::Lookup::RegexSieve.new({
+         /(yesterday|tomorrow)/i => [:date_reference, :match_single_day],
+         /(last|previous|next) (day|month|year|week)/i => [:date_reference, :match_single_unit],
+         /(\d+)\s(day|days|month|months|year|years|week|weeks) (ago|from now)/i => [:date_reference, :match_multiple]
+       })
+     end
+
+     def parse(form)
+       return form unless is_formula?(form)
+
+       form_opt = formulas[form, :match]
+       raise "Unknown formula #{form}" unless form_opt
+
+       if form_opt[:value][0] == :date_reference
+         date_reference(form_opt[:value][1], form_opt[:match])
+       end
+     end
+
+
+     def date_reference(formula, captured)
+       parsed = self.send("date_reference_#{formula}", *captured)
+       Date.current.send("#{parsed[:unit]}_#{parsed[:direction]}", parsed[:quantity]).strftime('%Y-%m-%d')
+     end
+
+     def date_reference_match_single_day(form, direction)
+       {
+         quantity: 1,
+         unit: 'days',
+         direction: { 'yesterday' => 'ago', 'tomorrow' => 'since' }[direction.downcase]
+       }
+     end
+
+     def date_reference_match_single_unit(form, direction, unit)
+       {
+         quantity: 1,
+         unit: unit.downcase.pluralize,
+         direction: { 'last' => 'ago', 'previous' => 'ago', 'next' => 'since' }[direction.downcase]
+       }
+     end
+
+     def date_reference_match_multiple(form, quantity, unit, direction)
+       {
+         quantity: quantity.to_i,
+         unit: unit.downcase.pluralize,
+         direction: { 'ago' => 'ago', 'from now' => 'since' }[direction.downcase]
+       }
+     end
+   end
+
+   class Tester
+
+     def initialize(job_name)
+       job_class_name = "#{job_name.gsub(/\s/,'')}Job"
+       @job = Object.const_get(job_class_name).new
+
+       @job_sources = DataSubjectCollection.new
+       @job_targets = DataSubjectCollection.new
+
+       @sources = DataSubjectCollection.new
+       @targets = DataSubjectCollection.new
+       @examples = DataExampleCollection.new
+
+       @filestore = Filestore.new
+     end
+
+     attr_reader :job
+     attr_reader :job_sources
+     attr_reader :job_targets
+     attr_reader :sources
+     attr_reader :targets
+     attr_reader :examples
+     attr_reader :filestore
+
+     def add_job_source(name)
+       raise "Unknown source #{name} for job" unless @job.methods.include? name.symbolize
+       @job_sources.add_subject(name, @job.send(name.symbolize))
+     end
+
+     def add_job_target(name)
+       raise "Unknown target #{name} for job" unless @job.methods.include? name.symbolize
+       @job_targets.add_subject(name, @job.send(name.symbolize))
+     end
+
+     def set_job_parameter(name, value)
+       @job.params[name.to_sym] = value
+     end
+
+     def add_source(name)
+       @sources.add_subject(name, @job.send(name.symbolize))
+     end
+
+     def source
+       @sources.only
+     end
+
+     def add_target(name)
+       @targets.add_subject(name, @job.send(name.symbolize))
+     end
+
+     def target
+       @targets.only
+     end
+
+
+     def add_example(example_name, example_table)
+       @examples.add_example(example_name, example_table)
+     end
+
+     def run_transforms
+       @job.run_all_transforms
+     end
+   end
+
+
+
+
+   class DataSubjectCollection
+     include Enumerable
+
+     def initialize
+       @subjects = {}
+     end
+
+     def [](subject_name)
+       @subjects[subject_name]
+     end
+
+     def each(&block)
+       @subjects.each &block
+     end
+
+     def keys
+       @subjects.keys
+     end
+
+     def add_subject(subject_name, subject)
+       @subjects[subject_name] ||= DataSubject.new(subject)
+     end
+
+     def add_field(full_field_name)
+       if full_field_name.include? ':'
+         subject_name, field_name = *Remi::BusinessRules.parse_full_field(full_field_name)
+         @subjects[subject_name].add_field(field_name)
+       else
+         @subjects.each do |subject_name, subject|
+           subject.add_field(full_field_name)
+         end
+       end
+     end
+
+     def only
+       raise "Multiple subjects defined: #{keys}" unless @subjects.size == 1
+       @subjects.values.first
+     end
+
+     def fields
+       dfc = DataFieldCollection.new
+       @subjects.each do |subject_name, subject|
+         subject.fields.each { |field_name, field| dfc.add_field(subject, field_name) }
+       end
+       dfc
+     end
+
+     def size
+       @subjects.size
+     end
+
+     def total_size
+       @subjects.reduce(0) { |sum, (name, subject)| sum += subject.size }
+     end
+   end
+
+
+   class DataSubject
+     def initialize(subject)
+       @data_obj = subject
+       @fields = DataFieldCollection.new
+
+       stub_data
+     end
+
+     attr_reader :data_obj
+
+     def add_field(field_name)
+       @fields.add_field(self, field_name)
+     end
+
+     def field
+       @fields.only
+     end
+
+     def fields
+       @fields
+     end
+
+     def size
+       @data_obj.df.size
+     end
+
+     # For debugging only
+     def _df
+       @data_obj.df
+     end
+
+
+
+     def stub_data
+       @data_obj.stub_df if @data_obj.respond_to? :stub_df
+     end
+
+     def stub_data_with(example)
+       stub_data
+       @data_obj.df = example.to_df(@data_obj.df.row[0].to_hash, field_symbolizer: @data_obj.field_symbolizer)
+     end
+
+
+     def replicate_rows(n_rows)
+       replicated_df = Daru::DataFrame.new([], order: @data_obj.df.vectors.to_a)
+       @data_obj.df.each do |vector|
+         replicated_df[vector.name] = vector.to_a * n_rows
+       end
+       @data_obj.df = replicated_df
+     end
+
+     def cumulative_dist_from_freq_table(table, freq_field: 'frequency')
+       cumulative_dist = {}
+       freq_total = 0
+       table.hashes.each do |row|
+         low = freq_total
+         high = freq_total + row[freq_field].to_f
+         freq_total = high
+         cumulative_dist[(low...high)] = row.tap { |r| r.delete(freq_field) }
+       end
+       cumulative_dist
+     end
+
+     def generate_values_from_cumulative_dist(n_records, cumulative_dist)
+       # Use the same key for reproducible tests
+       psuedorand = Random.new(3856382695386)
+
+       1.upto(n_records).reduce({}) do |h, idx|
+         r = psuedorand.rand
+         row_as_hash = cumulative_dist.select { |range| range.include? r }.values.first
+         row_as_hash.each do |field_name, value|
+           h[field_name] ||= []
+           h[field_name] << value
+         end
+         h
+       end
+     end
+
+     def distribute_values(table)
+       cumulative_dist = cumulative_dist_from_freq_table(table)
+       generated_data = generate_values_from_cumulative_dist(@data_obj.df.size, cumulative_dist)
+
+       generated_data.each do |field_name, data_array|
+         vector_name = fields[field_name].name
+         @data_obj.df[vector_name] = Daru::Vector.new(data_array, index: @data_obj.df.index)
+       end
+     end
+
+     def freq_by(*field_names)
+       @data_obj.df.group_by(field_names).size * 1.0 / @data_obj.df.size
+     end
+
+     def mock_extractor(filestore)
+       extractor = class << @data_obj.extractor; self; end
+
+       extractor.send(:define_method, :all_entries, ->() { filestore.sftp_entries })
+       extractor.send(:define_method, :download, ->(to_download) { to_download.map { |e| e.name } })
+     end
+
+     def extract
+       @data_obj.extractor.extract
+     end
+
+     def csv_options
+       @data_obj.csv_options
+     end
+
+   end
+
+
+   class DataFieldCollection
+     include Enumerable
+
+     def initialize
+       @fields = {}
+     end
+
+     def [](field_name)
+       @fields[field_name]
+     end
+
+     def each(&block)
+       @fields.each(&block)
+     end
+
+     def keys
+       @fields.keys
+     end
+
+     def names
+       @fields.values.map(&:name)
+     end
+
+     def add_field(subject, field_name)
+       @fields[field_name] = DataField.new(subject.data_obj, field_name) unless @fields.include? field_name
+     end
+
+     def only
+       raise "Multiple subject fields defined: #{keys}" if @fields.size > 1
+       @fields.values.first
+     end
+
+     # All values get tested as strings
+     def values
+       @fields.map { |field_name, field| field.values.map(&:to_s) }.transpose
+     end
+   end
+
+
+   class DataField
+     def initialize(subject, field_name)
+       @subject = subject
+       @field_name = field_name.symbolize(subject.field_symbolizer)
+     end
+
+     def name
+       @field_name
+     end
+
+     def metadata
+       @subject.fields[name]
+     end
+
+     def vector
+       @subject.df[@field_name]
+     end
+
+     def value
+       v = vector.to_a.uniq
+       raise "Multiple unique values found in subject data for field #{@field_name}" if v.size > 1
+       v.first
+     end
+
+     def values
+       vector.to_a.map(&:to_s)
+     end
+
+     def value=(arg)
+       vector.recode! { |v| arg }
+     end
+   end
+
+
+   class DataExampleCollection
+     include Enumerable
+
+     def initialize
+       @examples = {}
+     end
+
+     def [](example_name)
+       @examples[example_name]
+     end
+
+     def each(&block)
+       @examples.each(&block)
+     end
+
+     def keys
+       @examples.keys
+     end
+
+     def add_example(example_name, example_table)
+       @examples[example_name] = DataExample.new(example_table) unless @examples.include? example_name
+     end
+   end
+
+
+   class DataExample
+     def initialize(table)
+       @table = table
+     end
+
+     def to_df(seed_hash, field_symbolizer:)
+       table_headers = @table.headers.map { |h| h.symbolize(field_symbolizer) }
+       df = Daru::DataFrame.new([], order: seed_hash.keys | table_headers)
+       @table.hashes.each do |example_row|
+         example_row_sym = example_row.reduce({}) do |h, (k,v)|
+           h[k.symbolize(field_symbolizer)] = ParseFormula.parse(v)
+           h
+         end
+         df.add_row(seed_hash.merge(example_row_sym))
+       end
+       df
+     end
+   end
+
+
+   class Filestore
+     def initialize
+       @files = []
+       @delivered = {}
+     end
+
+     attr_reader :sftp_entries
+
+     def pattern(pattern)
+       @pattern = pattern
+     end
+
+     def anti_pattern(pattern)
+       @pattern = /^ThisBetterNeverMatchAnythingOrIWillShootYou\d{8}Times$/
+     end
+
+     def delivered_since(date_time)
+       @delivered = { :since => date_time }
+     end
+
+     def delivered_before(date_time)
+       @delivered = { :before => date_time }
+     end
+
+     def latest
+       @files.max_by { |f| f[:attributes][:createdtime] }[:name]
+     end
+
+     def generate
+       psuedorand = Random.new(4985674985672348954987589429)
+
+       generate_files_with_pattern
+       @files.map! do |file|
+         date_method = @delivered.keys.first
+         if date_method == :since
+           file[:attributes][:createdtime] = @delivered[:since] + 10 + psuedorand.rand * 100
+         elsif date_method == :before
+           file[:attributes][:createdtime] = @delivered[:since] - 10 - psuedorand.rand * 100
+         else
+           file[:attributes][:createdtime] = Time.now - 10 - psuedorand.rand * 100
+         end
+         file
+       end
+     end
+
+     def sftp_entries
+       @files.map do |file|
+         Net::SFTP::Protocol::V04::Name.new(
+           file[:name],
+           Net::SFTP::Protocol::V04::Attributes.new(createtime: file[:attributes][:createdtime])
+         )
+       end
+     end
+
+     private
+
+     def generate_files_with_pattern
+       filenames = 1.upto(5).map { |f| @pattern.random_example }.uniq
+
+       @files = filenames.map do |fname|
+         {
+           name: fname,
+           attributes: {
+             createdtime: nil
+           }
+         }
+       end
+     end
+   end
+ end
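
As an illustration of the formula parser added above (not from the diff): starred phrases taken from Cucumber example tables are resolved to concrete dates, while anything not wrapped in asterisks passes through unchanged. Date.current and the *_ago/*_since helpers come from ActiveSupport, which this code relies on; the sample inputs below are made up, and the exact return values depend on Remi::Lookup::RegexSieve's match output.

Remi::BusinessRules::ParseFormula.parse('*yesterday*')     # intended result: Date.current.days_ago(1), formatted 'YYYY-MM-DD'
Remi::BusinessRules::ParseFormula.parse('*3 months ago*')  # intended result: Date.current.months_ago(3), formatted 'YYYY-MM-DD'
Remi::BusinessRules::ParseFormula.parse('2016-01-01')      # not starred, so returned as-is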