remi 0.0.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. checksums.yaml +4 -4
  2. data/.bundle/config +2 -0
  3. data/.gitignore +3 -2
  4. data/.rspec +2 -0
  5. data/.ruby-version +1 -0
  6. data/Gemfile +4 -0
  7. data/Gemfile.lock +123 -0
  8. data/LICENSE.txt +21 -0
  9. data/README.md +94 -3
  10. data/bin/remi +8 -0
  11. data/doc/install-rbenv-os_x.md +47 -0
  12. data/lib/remi.rb +56 -9
  13. data/lib/remi/cli.rb +56 -0
  14. data/lib/remi/core/daru.rb +28 -0
  15. data/lib/remi/core/refinements.rb +21 -0
  16. data/lib/remi/core/string.rb +8 -0
  17. data/lib/remi/cucumber.rb +7 -0
  18. data/lib/remi/cucumber/business_rules.rb +504 -0
  19. data/lib/remi/cucumber/data_source.rb +63 -0
  20. data/lib/remi/data_source.rb +13 -0
  21. data/lib/remi/data_source/csv_file.rb +79 -0
  22. data/lib/remi/data_source/data_frame.rb +10 -0
  23. data/lib/remi/data_source/postgres.rb +58 -0
  24. data/lib/remi/data_source/salesforce.rb +78 -0
  25. data/lib/remi/data_subject.rb +25 -0
  26. data/lib/remi/data_target.rb +15 -0
  27. data/lib/remi/data_target/csv_file.rb +49 -0
  28. data/lib/remi/data_target/data_frame.rb +14 -0
  29. data/lib/remi/data_target/salesforce.rb +49 -0
  30. data/lib/remi/extractor/sftp_file.rb +84 -0
  31. data/lib/remi/field_symbolizers.rb +17 -0
  32. data/lib/remi/job.rb +200 -0
  33. data/lib/remi/lookup/regex_sieve.rb +55 -0
  34. data/lib/remi/project/features/examples.feature +24 -0
  35. data/lib/remi/project/features/formulas.feature +64 -0
  36. data/lib/remi/project/features/sample_job.feature +304 -0
  37. data/lib/remi/project/features/step_definitions/remi_step.rb +310 -0
  38. data/lib/remi/project/features/support/env.rb +10 -0
  39. data/lib/remi/project/features/support/env_app.rb +3 -0
  40. data/lib/remi/project/features/transforms/date_diff.feature +50 -0
  41. data/lib/remi/project/features/transforms/parse_date.feature +34 -0
  42. data/lib/remi/project/features/transforms/prefix.feature +15 -0
  43. data/lib/remi/project/jobs/all_jobs_shared.rb +25 -0
  44. data/lib/remi/project/jobs/copy_source_job.rb +12 -0
  45. data/lib/remi/project/jobs/sample_job.rb +164 -0
  46. data/lib/remi/project/jobs/transforms/date_diff_job.rb +17 -0
  47. data/lib/remi/project/jobs/transforms/parse_date_job.rb +18 -0
  48. data/lib/remi/project/jobs/transforms/prefix_job.rb +16 -0
  49. data/lib/remi/project/jobs/transforms/transform_jobs.rb +3 -0
  50. data/lib/remi/settings.rb +39 -0
  51. data/lib/remi/sf_bulk_helper.rb +265 -0
  52. data/lib/remi/source_to_target_map.rb +93 -0
  53. data/lib/remi/transform.rb +137 -0
  54. data/lib/remi/version.rb +3 -0
  55. data/remi.gemspec +25 -7
  56. data/workbooks/sample_workbook.ipynb +56 -0
  57. data/workbooks/workbook_helper.rb +1 -0
  58. metadata +234 -17
  59. data/lib/noodling.rb +0 -163
  60. data/test/test_NAME.rb +0 -19
data/lib/remi/core/daru.rb
@@ -0,0 +1,28 @@
+module Daru
+  class DataFrame
+    def monkey_dup
+      dupdf = Daru::DataFrame.new([], index: self.index)
+      self.vectors.each do |v|
+        dupdf[v] = self[v]
+      end
+
+      dupdf
+    end
+
+    def monkey_merge(other)
+      other.vectors.each do |v|
+        self[v] = other[v]
+      end
+
+      self
+    end
+
+    def hash_dump(filename)
+      File.write(filename, Marshal.dump(self.to_hash))
+    end
+
+    def self.from_hash_dump(filename)
+      Daru::DataFrame.new(Marshal.load(File.read(filename)))
+    end
+  end
+end
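The hunk above monkey-patches Daru::DataFrame with copy, merge, and Marshal round-trip helpers. A minimal usage sketch (not part of the diff; assumes the daru gem is installed and the file above has been required):

    require 'daru'

    df    = Daru::DataFrame.new(a: [1, 2], b: [3, 4])
    other = Daru::DataFrame.new(c: [5, 6])

    copy = df.monkey_dup          # rebuilds the frame vector by vector
    copy.monkey_merge(other)      # copies :c in alongside :a and :b

    copy.hash_dump('frame.dump')  # Marshal the frame's hash representation to disk
    restored = Daru::DataFrame.from_hash_dump('frame.dump')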
data/lib/remi/core/refinements.rb
@@ -0,0 +1,21 @@
+module Remi
+  module Core
+    module Refinements
+      refine String do
+        def symbolize(symbolizer=nil)
+          if symbolizer
+            symbolizer.call(self)
+          else
+            Remi::FieldSymbolizers[:standard].call(self)
+          end
+        end
+      end
+
+      refine Symbol do
+        def symbolize(symbolizer=nil)
+          self.to_s.symbolize
+        end
+      end
+    end
+  end
+end
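These refinements add a #symbolize method to String and Symbol wherever the refinement is activated with `using`. A rough sketch of the intended call pattern (illustrative only, not from the gem; any object responding to #call can serve as the symbolizer):

    module MyStepHelpers
      using Remi::Core::Refinements

      def field_key(header)
        # With no argument, Remi::FieldSymbolizers[:standard] is used
        header.symbolize
      end

      def custom_field_key(header)
        # A custom symbolizer is anything that responds to #call
        header.symbolize(->(name) { name.strip.downcase.gsub(/\s+/, '_').to_sym })
      end
    end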
data/lib/remi/core/string.rb
@@ -0,0 +1,8 @@
+class String
+  # Strip leading whitespace from each line that is the same as the
+  # amount of whitespace on the first line of the string.
+  # Leaves _additional_ indentation on later lines intact.
+  def unindent
+    gsub /^#{self[/\A\s*/]}/, ''
+  end
+end
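String#unindent strips from every line the same leading whitespace found on the first line, which is handy for indented heredocs. A quick illustration (not part of the gem):

    text = <<-EOT
      SELECT id
      FROM contacts
        WHERE status = 'active'
    EOT

    puts text.unindent
    # The base indent taken from the first line is removed from every line;
    # the extra indent before WHERE is kept.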
data/lib/remi/cucumber.rb
@@ -0,0 +1,7 @@
+require 'rspec/expectations'
+require 'cucumber/rspec/doubles'
+
+require 'regexp-examples'
+
+require_relative 'cucumber/data_source'
+require_relative 'cucumber/business_rules'
data/lib/remi/cucumber/business_rules.rb
@@ -0,0 +1,504 @@
+module Remi::BusinessRules
+  using Remi::Core::Refinements
+
+  def self.parse_full_field(full_field_name)
+    full_field_name.split(':').map(&:strip)
+  end
+
+  def self.csv_opt_map
+    {
+      'tab' => "\t",
+      'comma' => ',',
+      'pipe' => '|',
+      'double quote' => '"',
+      'single quote' => "'",
+      'windows' => "\r\n",
+      'unix' => "\n",
+      'windows or unix' => :auto,
+    }
+  end
+
+
+  module ParseFormula
+    extend self
+
+    def is_formula?(arg)
+      !base_regex.match(arg).nil?
+    end
+
+    def base_regex
+      @base_regex ||= /\A\*(.*)\*\Z/
+    end
+
+    def formulas
+      @formulas ||= Remi::Lookup::RegexSieve.new({
+        /(yesterday|tomorrow)/i => [:date_reference, :match_single_day],
+        /(last|previous|next) (day|month|year|week)/i => [:date_reference, :match_single_unit],
+        /(\d+)\s(day|days|month|months|year|years|week|weeks) (ago|from now)/i => [:date_reference, :match_multiple]
+      })
+    end
+
+    def parse(form)
+      return form unless is_formula?(form)
+
+      form_opt = formulas[form, :match]
+      raise "Unknown formula #{form}" unless form_opt
+
+      if form_opt[:value][0] == :date_reference
+        date_reference(form_opt[:value][1], form_opt[:match])
+      end
+    end
+
+
+    def date_reference(formula, captured)
+      parsed = self.send("date_reference_#{formula}", *captured)
+      Date.current.send("#{parsed[:unit]}_#{parsed[:direction]}", parsed[:quantity]).strftime('%Y-%m-%d')
+    end
+
+    def date_reference_match_single_day(form, direction)
+      {
+        quantity: 1,
+        unit: 'days',
+        direction: { 'yesterday' => 'ago', 'tomorrow' => 'since' }[direction.downcase]
+      }
+    end
+
+    def date_reference_match_single_unit(form, direction, unit)
+      {
+        quantity: 1,
+        unit: unit.downcase.pluralize,
+        direction: { 'last' => 'ago', 'previous' => 'ago', 'next' => 'since' }[direction.downcase]
+      }
+    end
+
+    def date_reference_match_multiple(form, quantity, unit, direction)
+      {
+        quantity: quantity.to_i,
+        unit: unit.downcase.pluralize,
+        direction: { 'ago' => 'ago', 'from now' => 'since' }[direction.downcase]
+      }
+    end
+  end
+
+  class Tester
+
+    def initialize(job_name)
+      job_class_name = "#{job_name.gsub(/\s/,'')}Job"
+      @job = Object.const_get(job_class_name).new
+
+      @job_sources = DataSubjectCollection.new
+      @job_targets = DataSubjectCollection.new
+
+      @sources = DataSubjectCollection.new
+      @targets = DataSubjectCollection.new
+      @examples = DataExampleCollection.new
+
+      @filestore = Filestore.new
+    end
+
+    attr_reader :job
+    attr_reader :job_sources
+    attr_reader :job_targets
+    attr_reader :sources
+    attr_reader :targets
+    attr_reader :examples
+    attr_reader :filestore
+
+    def add_job_source(name)
+      raise "Unknown source #{name} for job" unless @job.methods.include? name.symbolize
+      @job_sources.add_subject(name, @job.send(name.symbolize))
+    end
+
+    def add_job_target(name)
+      raise "Unknown target #{name} for job" unless @job.methods.include? name.symbolize
+      @job_targets.add_subject(name, @job.send(name.symbolize))
+    end
+
+    def set_job_parameter(name, value)
+      @job.params[name.to_sym] = value
+    end
+
+    def add_source(name)
+      @sources.add_subject(name, @job.send(name.symbolize))
+    end
+
+    def source
+      @sources.only
+    end
+
+    def add_target(name)
+      @targets.add_subject(name, @job.send(name.symbolize))
+    end
+
+    def target
+      @targets.only
+    end
+
+
+    def add_example(example_name, example_table)
+      @examples.add_example(example_name, example_table)
+    end
+
+    def run_transforms
+      @job.run_all_transforms
+    end
+  end
+
+
+
+
+  class DataSubjectCollection
+    include Enumerable
+
+    def initialize
+      @subjects = {}
+    end
+
+    def [](subject_name)
+      @subjects[subject_name]
+    end
+
+    def each(&block)
+      @subjects.each &block
+    end
+
+    def keys
+      @subjects.keys
+    end
+
+    def add_subject(subject_name, subject)
+      @subjects[subject_name] ||= DataSubject.new(subject)
+    end
+
+    def add_field(full_field_name)
+      if full_field_name.include? ':'
+        subject_name, field_name = *Remi::BusinessRules.parse_full_field(full_field_name)
+        @subjects[subject_name].add_field(field_name)
+      else
+        @subjects.each do |subject_name, subject|
+          subject.add_field(full_field_name)
+        end
+      end
+    end
+
+    def only
+      raise "Multiple subjects defined: #{keys}" unless @subjects.size == 1
+      @subjects.values.first
+    end
+
+    def fields
+      dfc = DataFieldCollection.new
+      @subjects.each do |subject_name, subject|
+        subject.fields.each { |field_name, field| dfc.add_field(subject, field_name) }
+      end
+      dfc
+    end
+
+    def size
+      @subjects.size
+    end
+
+    def total_size
+      @subjects.reduce(0) { |sum, (name, subject)| sum += subject.size }
+    end
+  end
+
+
+  class DataSubject
+    def initialize(subject)
+      @data_obj = subject
+      @fields = DataFieldCollection.new
+
+      stub_data
+    end
+
+    attr_reader :data_obj
+
+    def add_field(field_name)
+      @fields.add_field(self, field_name)
+    end
+
+    def field
+      @fields.only
+    end
+
+    def fields
+      @fields
+    end
+
+    def size
+      @data_obj.df.size
+    end
+
+    # For debugging only
+    def _df
+      @data_obj.df
+    end
+
+
+
+
+    def stub_data
+      @data_obj.stub_df if @data_obj.respond_to? :stub_df
+    end
+
+    def stub_data_with(example)
+      stub_data
+      @data_obj.df = example.to_df(@data_obj.df.row[0].to_hash, field_symbolizer: @data_obj.field_symbolizer)
+    end
+
+
+    def replicate_rows(n_rows)
+      replicated_df = Daru::DataFrame.new([], order: @data_obj.df.vectors.to_a)
+      @data_obj.df.each do |vector|
+        replicated_df[vector.name] = vector.to_a * n_rows
+      end
+      @data_obj.df = replicated_df
+    end
+
+    def cumulative_dist_from_freq_table(table, freq_field: 'frequency')
+      cumulative_dist = {}
+      freq_total = 0
+      table.hashes.each do |row|
+        low = freq_total
+        high = freq_total + row[freq_field].to_f
+        freq_total = high
+        cumulative_dist[(low...high)] = row.tap { |r| r.delete(freq_field) }
+      end
+      cumulative_dist
+    end
+
+    def generate_values_from_cumulative_dist(n_records, cumulative_dist)
+      # Use the same key for reproducible tests
+      psuedorand = Random.new(3856382695386)
+
+      1.upto(n_records).reduce({}) do |h, idx|
+        r = psuedorand.rand
+        row_as_hash = cumulative_dist.select { |range| range.include? r }.values.first
+        row_as_hash.each do |field_name, value|
+          h[field_name] ||= []
+          h[field_name] << value
+        end
+        h
+      end
+    end
+
+    def distribute_values(table)
+      cumulative_dist = cumulative_dist_from_freq_table(table)
+      generated_data = generate_values_from_cumulative_dist(@data_obj.df.size, cumulative_dist)
+
+      generated_data.each do |field_name, data_array|
+        vector_name = fields[field_name].name
+        @data_obj.df[vector_name] = Daru::Vector.new(data_array, index: @data_obj.df.index)
+      end
+    end
+
+    def freq_by(*field_names)
+      @data_obj.df.group_by(field_names).size * 1.0 / @data_obj.df.size
+    end
+
+    def mock_extractor(filestore)
+      extractor = class << @data_obj.extractor; self; end
+
+      extractor.send(:define_method, :all_entries, ->() { filestore.sftp_entries })
+      extractor.send(:define_method, :download, ->(to_download) { to_download.map { |e| e.name } })
+    end
+
+    def extract
+      @data_obj.extractor.extract
+    end
+
+    def csv_options
+      @data_obj.csv_options
+    end
+
+  end
+
+
+  class DataFieldCollection
+    include Enumerable
+
+    def initialize
+      @fields = {}
+    end
+
+    def [](field_name)
+      @fields[field_name]
+    end
+
+    def each(&block)
+      @fields.each(&block)
+    end
+
+    def keys
+      @fields.keys
+    end
+
+    def names
+      @fields.values.map(&:name)
+    end
+
+    def add_field(subject, field_name)
+      @fields[field_name] = DataField.new(subject.data_obj, field_name) unless @fields.include? field_name
+    end
+
+    def only
+      raise "Multiple subject fields defined: #{keys}" if @fields.size > 1
+      @fields.values.first
+    end
+
+    # All values get tested as strings
+    def values
+      @fields.map { |field_name, field| field.values.map(&:to_s) }.transpose
+    end
+  end
+
+
+  class DataField
+    def initialize(subject, field_name)
+      @subject = subject
+      @field_name = field_name.symbolize(subject.field_symbolizer)
+    end
+
+    def name
+      @field_name
+    end
+
+    def metadata
+      @subject.fields[name]
+    end
+
+    def vector
+      @subject.df[@field_name]
+    end
+
+    def value
+      v = vector.to_a.uniq
+      raise "Multiple unique values found in subject data for field #{@field_name}" if v.size > 1
+      v.first
+    end
+
+    def values
+      vector.to_a.map(&:to_s)
+    end
+
+    def value=(arg)
+      vector.recode! { |v| arg }
+    end
+  end
+
+
+  class DataExampleCollection
+    include Enumerable
+
+    def initialize
+      @examples = {}
+    end
+
+    def [](example_name)
+      @examples[example_name]
+    end
+
+    def each(&block)
+      @examples.each(&block)
+    end
+
+    def keys
+      @examples.keys
+    end
+
+    def add_example(example_name, example_table)
+      @examples[example_name] = DataExample.new(example_table) unless @examples.include? example_name
+    end
+  end
+
+
+  class DataExample
+    def initialize(table)
+      @table = table
+    end
+
+    def to_df(seed_hash, field_symbolizer:)
+      table_headers = @table.headers.map { |h| h.symbolize(field_symbolizer) }
+      df = Daru::DataFrame.new([], order: seed_hash.keys | table_headers)
+      @table.hashes.each do |example_row|
+        example_row_sym = example_row.reduce({}) do |h, (k,v)|
+          h[k.symbolize(field_symbolizer)] = ParseFormula.parse(v)
+          h
+        end
+        df.add_row(seed_hash.merge(example_row_sym))
+      end
+      df
+    end
+  end
+
+
+  class Filestore
+    def initialize
+      @files = []
+      @delivered = {}
+    end
+
+    attr_reader :sftp_entries
+
+    def pattern(pattern)
+      @pattern = pattern
+    end
+
+    def anti_pattern(pattern)
+      @pattern = /^ThisBetterNeverMatchAnythingOrIWillShootYou\d{8}Times$/
+    end
+
+    def delivered_since(date_time)
+      @delivered = { :since => date_time }
+    end
+
+    def delivered_before(date_time)
+      @delivered = { :before => date_time }
+    end
+
+    def latest
+      @files.max_by { |f| f[:attributes][:createdtime] }[:name]
+    end
+
+    def generate
+      psuedorand = Random.new(4985674985672348954987589429)
+
+      generate_files_with_pattern
+      @files.map! do |file|
+        date_method = @delivered.keys.first
+        if date_method == :since
+          file[:attributes][:createdtime] = @delivered[:since] + 10 + psuedorand.rand * 100
+        elsif date_method == :before
+          file[:attributes][:createdtime] = @delivered[:since] - 10 - psuedorand.rand * 100
+        else
+          file[:attributes][:createdtime] = Time.now - 10 - psuedorand.rand * 100
+        end
+        file
+      end
+    end
+
+    def sftp_entries
+      @files.map do |file|
+        Net::SFTP::Protocol::V04::Name.new(
+          file[:name],
+          Net::SFTP::Protocol::V04::Attributes.new(createtime: file[:attributes][:createdtime])
+        )
+      end
+    end
+
+    private
+
+    def generate_files_with_pattern
+      filenames = 1.upto(5).map { |f| @pattern.random_example }.uniq
+
+      @files = filenames.map do |fname|
+        {
+          name: fname,
+          attributes: {
+            createdtime: nil
+          }
+        }
+      end
+    end
+  end
+end
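For orientation, the ParseFormula module above resolves `*...*` formula strings from Cucumber tables into concrete dates. A rough sketch of the behaviour (illustrative only; the code assumes ActiveSupport extensions such as Date.current, String#pluralize, and Date#days_ago / #months_ago are loaded):

    parse = Remi::BusinessRules::ParseFormula

    parse.parse('*yesterday*')      # roughly Date.current.days_ago(1).strftime('%Y-%m-%d')
    parse.parse('*3 months ago*')   # roughly Date.current.months_ago(3).strftime('%Y-%m-%d')
    parse.parse('plain text')       # returned unchanged: no surrounding asterisks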