remi 0.2.27 → 0.2.28

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. checksums.yaml +4 -4
  2. data/Gemfile +3 -0
  3. data/Gemfile.lock +34 -5
  4. data/features/metadata.feature +17 -0
  5. data/features/step_definitions/remi_step.rb +6 -6
  6. data/features/transforms/date_diff.feature +1 -0
  7. data/jobs/aggregate_job.rb +0 -1
  8. data/jobs/all_jobs_shared.rb +0 -2
  9. data/jobs/copy_source_job.rb +0 -1
  10. data/jobs/csv_file_target_job.rb +0 -1
  11. data/jobs/metadata_job.rb +60 -0
  12. data/jobs/parameters_job.rb +1 -1
  13. data/jobs/sample_job.rb +19 -20
  14. data/jobs/sftp_file_target_job.rb +0 -1
  15. data/jobs/transforms/date_diff_job.rb +1 -1
  16. data/jobs/transforms/nvl_job.rb +1 -1
  17. data/jobs/transforms/parse_date_job.rb +7 -4
  18. data/jobs/transforms/prefix_job.rb +1 -1
  19. data/jobs/transforms/truncate_job.rb +1 -1
  20. data/lib/remi.rb +10 -15
  21. data/lib/remi/cucumber/business_rules.rb +23 -23
  22. data/lib/remi/cucumber/data_source.rb +2 -1
  23. data/lib/remi/data_frame.rb +36 -0
  24. data/lib/remi/data_frame/daru.rb +67 -0
  25. data/lib/remi/data_subject.rb +71 -10
  26. data/lib/remi/data_subject/csv_file.rb +151 -0
  27. data/lib/remi/data_subject/data_frame.rb +53 -0
  28. data/lib/remi/data_subject/postgres.rb +136 -0
  29. data/lib/remi/data_subject/salesforce.rb +136 -0
  30. data/lib/remi/data_subject/sftp_file.rb +66 -0
  31. data/lib/remi/fields.rb +8 -0
  32. data/lib/remi/source_to_target_map.rb +56 -32
  33. data/lib/remi/transform.rb +426 -83
  34. data/lib/remi/version.rb +1 -1
  35. data/remi.gemspec +2 -1
  36. data/spec/metadata_spec.rb +62 -0
  37. metadata +15 -28
  38. data/lib/remi/data_source.rb +0 -13
  39. data/lib/remi/data_source/csv_file.rb +0 -101
  40. data/lib/remi/data_source/data_frame.rb +0 -16
  41. data/lib/remi/data_source/postgres.rb +0 -58
  42. data/lib/remi/data_source/salesforce.rb +0 -87
  43. data/lib/remi/data_target.rb +0 -15
  44. data/lib/remi/data_target/csv_file.rb +0 -42
  45. data/lib/remi/data_target/data_frame.rb +0 -14
  46. data/lib/remi/data_target/postgres.rb +0 -74
  47. data/lib/remi/data_target/salesforce.rb +0 -54
  48. data/lib/remi/data_target/sftp_file.rb +0 -54
  49. data/lib/remi/refinements/daru.rb +0 -85
data/jobs/transforms/truncate_job.rb

@@ -13,7 +13,7 @@ class TruncateJob
   define_transform :main, sources: :source_data, targets: :target_data do
     Remi::SourceToTargetMap.apply(source_data.df, target_data.df) do
       map source(:my_field) .target(:truncated_field)
-        .transform(Remi::Transform[:truncate].(params[:truncate_len].to_i))
+        .transform(Remi::Transform::Truncate.new(params[:truncate_len].to_i))
     end
   end
 end
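The hunk above captures the 0.2.28 transform API change: transforms are no longer fetched as curried lambdas from Remi::Transform[...] but are instantiated as classes. A minimal sketch of a mapping block in the new style follows; the field names mirror the example job above, and the hard-coded length of 5 is illustrative only.

# Sketch only: assumes source_data and target_data are defined as in the job above.
Remi::SourceToTargetMap.apply(source_data.df, target_data.df) do
  map source(:my_field) .target(:truncated_field)
    .transform(Remi::Transform::Truncate.new(5))   # class-based transform; options go to .new
end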
data/lib/remi.rb

@@ -39,25 +39,20 @@ require 'remi/settings'
 require 'remi/job'
 require 'remi/source_to_target_map'
 require 'remi/field_symbolizers'
-require 'remi/data_subject'
-require 'remi/sf_bulk_helper' # separate into SF support package

 require 'remi/refinements/symbolizer'
-require 'remi/refinements/daru'

 require 'remi/extractor/sftp_file'

-require 'remi/data_source.rb'
-require 'remi/data_source/data_frame'
-require 'remi/data_source/csv_file'
-require 'remi/data_source/salesforce'
-require 'remi/data_source/postgres'
-
-require 'remi/data_target.rb'
-require 'remi/data_target/data_frame'
-require 'remi/data_target/salesforce'
-require 'remi/data_target/csv_file'
-require 'remi/data_target/sftp_file'
-require 'remi/data_target/postgres'
+require 'remi/fields'
+require 'remi/data_frame'
+require 'remi/data_frame/daru'
+
+require 'remi/data_subject'
+require 'remi/data_subject/csv_file'
+#require 'remi/data_subject/salesforce' # intentionally not included by default
+require 'remi/data_subject/postgres'
+require 'remi/data_subject/sftp_file'
+require 'remi/data_subject/data_frame'

 require 'remi/transform'
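As the comment in the hunk above notes, the Salesforce subject is intentionally not loaded by default in 0.2.28. A sketch of how a job that still needs it would opt in; this is inferred from the commented-out require above, not from documented guidance.

require 'remi'
require 'remi/data_subject/salesforce'  # explicit opt-in; no longer pulled in by lib/remi.rb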
data/lib/remi/cucumber/business_rules.rb

@@ -250,14 +250,14 @@ module Remi::BusinessRules
   class DataSubject
     def initialize(name, subject)
       @name = name
-      @data_obj = subject
+      @data_subject = subject
       @fields = DataFieldCollection.new

       stub_data
     end

     attr_reader :name
-    attr_reader :data_obj
+    attr_reader :data_subject

     def add_field(field_name)
       @fields.add_field(self, field_name)
@@ -272,17 +272,17 @@ module Remi::BusinessRules
     end

     def size
-      @data_obj.df.size
+      @data_subject.df.size
     end

     def get_attrib(name)
-      @data_obj.send(name)
+      @data_subject.send(name)
     end

     # Public: Converts the data subject to a hash where the keys are the table
     # columns and the values are an array for the value of column for each row.
     def column_hash
-      @data_obj.df.to_hash.reduce({}) do |h, (k,v)|
+      @data_subject.df.to_h.reduce({}) do |h, (k,v)|
         h[k.symbolize] = v.to_a
         h
       end
@@ -290,7 +290,7 @@ module Remi::BusinessRules

     # For debugging only
     def _df
-      @data_obj.df
+      @data_subject.df
     end

@@ -298,7 +298,7 @@ module Remi::BusinessRules
     # Need more robust duping to make that feasible.
     # Don't use results for anything more than size.
     def where(field_name, operation)
-      @data_obj.df.where(@data_obj.df[field_name.symbolize(@data_obj.field_symbolizer)].recode { |v| operation.call(v) })
+      @data_subject.df.where(@data_subject.df[field_name.symbolize(@data_subject.field_symbolizer)].recode { |v| operation.call(v) })
     end

     def where_is(field_name, value)
@@ -324,29 +324,29 @@ module Remi::BusinessRules


     def stub_data
-      @data_obj.stub_df if @data_obj.respond_to? :stub_df
+      @data_subject.stub_df if @data_subject.respond_to? :stub_df
     end

     def example_to_df(example)
-      example.to_df(@data_obj.df.row[0].to_hash, field_symbolizer: @data_obj.field_symbolizer)
+      example.to_df(@data_subject.df.row[0].to_h, field_symbolizer: @data_subject.field_symbolizer)
     end

     def stub_data_with(example)
       stub_data
-      @data_obj.df = example_to_df(example)
+      @data_subject.df = example_to_df(example)
     end

     def append_data_with(example)
-      @data_obj.df = @data_obj.df.concat example_to_df(example)
+      @data_subject.df = @data_subject.df.concat example_to_df(example)
     end


     def replicate_rows(n_rows)
-      replicated_df = Daru::DataFrame.new([], order: @data_obj.df.vectors.to_a)
-      @data_obj.df.each do |vector|
+      replicated_df = Daru::DataFrame.new([], order: @data_subject.df.vectors.to_a)
+      @data_subject.df.each do |vector|
         replicated_df[vector.name] = vector.to_a * n_rows
       end
-      @data_obj.df = replicated_df
+      @data_subject.df = replicated_df
     end

     def cumulative_dist_from_freq_table(table, freq_field: 'frequency')
@@ -378,31 +378,31 @@ module Remi::BusinessRules

     def distribute_values(table)
       cumulative_dist = cumulative_dist_from_freq_table(table)
-      generated_data = generate_values_from_cumulative_dist(@data_obj.df.size, cumulative_dist)
+      generated_data = generate_values_from_cumulative_dist(@data_subject.df.size, cumulative_dist)

       generated_data.each do |field_name, data_array|
         vector_name = fields[field_name].field_name
-        @data_obj.df[vector_name] = Daru::Vector.new(data_array, index: @data_obj.df.index)
+        @data_subject.df[vector_name] = Daru::Vector.new(data_array, index: @data_subject.df.index)
       end
     end

     def freq_by(*field_names)
-      @data_obj.df.group_by(field_names).size * 1.0 / @data_obj.df.size
+      @data_subject.df.group_by(field_names).size * 1.0 / @data_subject.df.size
     end

     def mock_extractor(filestore)
-      extractor = class << @data_obj.extractor; self; end
+      extractor = class << @data_subject.extractor; self; end

       extractor.send(:define_method, :all_entries, ->() { filestore.sftp_entries })
       extractor.send(:define_method, :download, ->(to_download) { to_download.map { |e| e.name } })
     end

     def extract
-      @data_obj.extractor.extract
+      @data_subject.extractor.extract
     end

     def csv_options
-      @data_obj.csv_options
+      @data_subject.csv_options
     end

   end
@@ -456,7 +456,7 @@ module Remi::BusinessRules
     def initialize(subject, name)
       @subject = subject
       @name = name
-      @field_name = name.symbolize(subject.data_obj.field_symbolizer)
+      @field_name = name.symbolize(subject.data_subject.field_symbolizer)
     end

     attr_reader :name
@@ -468,11 +468,11 @@ module Remi::BusinessRules
     end

     def metadata
-      @subject.data_obj.fields[@field_name]
+      @subject.data_subject.fields[@field_name]
     end

     def vector
-      @subject.data_obj.df[@field_name]
+      @subject.data_subject.df[@field_name]
     end

     def value
data/lib/remi/cucumber/data_source.rb

@@ -54,7 +54,8 @@ module Remi
     end
   end

-  class Salesforce
+  # Hmmm.... this gets called first because I'm trying to split SF off as a "plugin"
+  class Salesforce < Remi::DataSubject
     include DataStub
   end

data/lib/remi/data_frame.rb

@@ -0,0 +1,36 @@
+module Remi
+  module DataFrame
+    class << self
+      def create(remi_df_type = :daru, *args, **kargs, &block)
+        dataframe = case remi_df_type
+        when :daru
+          Remi::DataFrame::Daru.new(*args, **kargs, &block)
+        else
+          raise TypeError, "Unknown frame type: #{remi_df_type}"
+        end
+      end
+
+      def daru(*args, **kargs, &block)
+        self.create(:daru, *args, **kargs, &block)
+      end
+    end
+
+
+    def [](*args)
+      super
+    end
+
+    def size
+      super
+    end
+
+    def write_csv(*args, **kargs, &block)
+      super
+    end
+
+    # Public: Returns the type of DataFrame
+    def remi_df_type
+      raise NoMethodError, "#{__method__} not defined for #{self.class.name}"
+    end
+  end
+end
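The new Remi::DataFrame module is both a small factory and a mixin of methods every frame type is expected to answer. A usage sketch with illustrative data; both constructor calls below end up wrapping a Daru::DataFrame via Remi::DataFrame::Daru.

df = Remi::DataFrame.create(:daru, { a: [1, 2, 3] })
df.remi_df_type                          #=> :daru
Remi::DataFrame.daru({ a: [1, 2, 3] })   # shorthand for create(:daru, ...)
Remi::DataFrame.create(:other, {})       # raises TypeError: Unknown frame type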
data/lib/remi/data_frame/daru.rb

@@ -0,0 +1,67 @@
+module Remi
+  module DataFrame
+    class Daru < SimpleDelegator
+      include Remi::DataFrame
+
+      def initialize(*args, **kargs, &block)
+        if args[0].is_a? ::Daru::DataFrame
+          super(args[0])
+        else
+          super(::Daru::DataFrame.new(*args, **kargs, &block))
+        end
+      end
+
+
+      # Public: Returns the type of DataFrame
+      def remi_df_type
+        :daru
+      end
+
+      # Public: Saves a Dataframe to a file.
+      def hash_dump(filename)
+        File.binwrite(filename, Marshal.dump(self))
+      end
+
+      # Public: Creates a DataFrame by reading the dumped version from a file.
+      def self.from_hash_dump(filename)
+        Marshal.load(File.binread(filename))
+      end
+
+      # Public: Allows the user to define an arbitrary aggregation function.
+      #
+      # by   - The name of the DataFrame vector to use to group records.
+      # func - A lambda function that accepts three arguments - the
+      #        first argument is the DataFrame, the second is the
+      #        key to the current group, and the third is the index
+      #        of the elements belonging to a group.
+      #
+      # Example:
+      #   df = Remi::DataFrame::Daru.new( { a: ['a','a','a','b','b'], year: ['2018','2015','2019', '2014', '2013'] })
+      #
+      #   mymin = lambda do |vector, df, group_key, indices|
+      #     values = indices.map { |idx| df.row[idx][vector] }
+      #     "Group #{group_key} has a minimum value of #{values.min}"
+      #   end
+      #
+      #   df.aggregate(by: :a, func: mymin.curry.(:year))
+      #
+      #
+      # Returns a Daru::Vector.
+      def aggregate(by:, func:)
+        grouped = self.group_by(by)
+        df_indices = self.index.to_a
+        ::Daru::Vector.new(
+          grouped.groups.reduce({}) do |h, (key, indices)|
+            # Daru groups don't use the index of the dataframe when returning groups (WTF?).
+            # Instead they return the position of the record in the dataframe. Here, we
+            group_df_indices = indices.map { |v| df_indices[v] }
+            group_key = key.size == 1 ? key.first : key
+            h[group_key] = func.(self, group_key, group_df_indices)
+            h
+          end
+        )
+      end
+
+    end
+  end
+end
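Remi::DataFrame::Daru delegates to a wrapped Daru::DataFrame and adds the Marshal-based dump helpers and the aggregate method documented above. A quick sketch of the dump round trip; the /tmp path and the data are illustrative.

df = Remi::DataFrame::Daru.new({ a: ['a', 'a', 'b'], year: ['2018', '2015', '2014'] })
df.hash_dump('/tmp/my_frame.bin')                             # serializes the frame with Marshal
restored = Remi::DataFrame::Daru.from_hash_dump('/tmp/my_frame.bin')
restored.remi_df_type                                         #=> :daru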
data/lib/remi/data_subject.rb

@@ -1,24 +1,85 @@
 module Remi
-  module DataSubject
+
+  # Namespaces for specific sources/targets
+  module DataSource; end
+  module DataTarget; end
+
+  class DataSubject
+    def initialize(*args, fields: Remi::Fields.new, remi_df_type: :daru, logger: Remi::Settings.logger, **kargs, &block)
+      @fields = fields
+      @remi_df_type = remi_df_type
+      @logger = logger
+    end
+
+    attr_accessor :fields
+
     def field_symbolizer
       Remi::FieldSymbolizers[:standard]
     end

     def df
-      @dataframe ||= Daru::DataFrame.new([])
+      @dataframe ||= Remi::DataFrame.create(@remi_df_type, [], order: @fields.keys)
     end

     def df=(new_dataframe)
-      @dataframe = new_dataframe
+      if new_dataframe.respond_to? :remi_df_type
+        @dataframe = new_dataframe
+      else
+        @dataframe = Remi::DataFrame.create(@remi_df_type, new_dataframe)
+      end
     end

-    # Fields is a hash where the keys are the data field names and the values
-    # are a hash of metadata. DataFrames do not currently support metadata,
-    # so the metdata will be empty unless overridden by the specific target.
-    def fields
-      df.vectors.to_a.reduce({}) do |h, v|
-        h[v] = {}
-        h
+    module DataSource
+
+      # Public: Access the dataframe from a DataSource
+      #
+      # Returns a Remi::DataFrame
+      def df
+        @dataframe ||= to_dataframe
+      end
+
+      # Public: Memoized version of extract!
+      def extract
+        @extract ||= extract!
+      end
+
+      # Public: Called to extract data from the source.
+      #
+      # Returns data in a format that can be used to create a dataframe.
+      def extract!
+        raise NoMethodError, "#{__method__} not defined for #{self.class.name}"
+        @extract
+      end
+
+      # Public: Converts extracted data to a dataframe
+      #
+      # Returns a Remi::DataFrame
+      def to_dataframe
+        raise NoMethodError, "#{__method__} not defined for #{self.class.name}"
+      end
+    end
+
+    module DataTarget
+
+      # Public: Loads data to the target. This is automatically called
+      # after all transforms have executed, but could also get called manually.
+      # The actual load operation is only executed if hasn't already.
+      #
+      # Returns true if the load operation was successful.
+      def load
+        return true if @loaded || df.size == 0
+
+        @loaded = load!
+      end
+
+      # Public: Performs the load operation, regardless of whether it has
+      # already executed.
+      #
+      # Returns true if the load operation was successful
+      def load!
+        raise NoMethodError, "#{__method__} not defined for #{self.class.name}"
+
+        false
       end
     end
   end
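The reworked DataSubject splits source and target behavior into two mixins: DataSource (implement extract! and to_dataframe; extract and df memoize them) and DataTarget (implement load!; load guards against reloading and empty frames). A hypothetical subject showing the contract; the class and its data are illustrative and not part of the gem.

class MyArraySource < Remi::DataSubject
  include Remi::DataSubject::DataSource

  # Required by the DataSource contract: pull raw data and stash it in @extract.
  def extract!
    @extract = [1, 2, 3]
  end

  # Required by the DataSource contract: convert the extracted data into a Remi::DataFrame.
  def to_dataframe
    Remi::DataFrame.create(:daru, { id: extract })
  end
end

MyArraySource.new.df.size  #=> 3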
data/lib/remi/data_subject/csv_file.rb

@@ -0,0 +1,151 @@
+module Remi
+  module DataSubject::CsvFile
+    def self.included(base)
+      base.extend(CsvFileClassMethods)
+    end
+
+    def field_symbolizer
+      self.class.default_csv_options[:header_converters]
+    end
+
+    module CsvFileClassMethods
+      def default_csv_options
+        @default_csv_options ||= CSV::DEFAULT_OPTIONS.merge({
+          headers: true,
+          header_converters: Remi::FieldSymbolizers[:standard],
+          converters: [],
+          col_sep: ',',
+          encoding: 'UTF-8',
+          quote_char: '"'
+        })
+      end
+    end
+  end
+
+
+
+
+
+  class DataSource::CsvFile < Remi::DataSubject
+    include Remi::DataSubject::DataSource
+    include Remi::DataSubject::CsvFile
+
+    def initialize(*args, **kargs, &block)
+      super
+      init_csv_file(*args, **kargs, &block)
+    end
+
+    attr_reader :extractor
+    attr_reader :csv_options
+
+    # Public: Called to extract data from the source.
+    #
+    # Returns data in a format that can be used to create a dataframe.
+    def extract!
+      @extract = Array(@extractor.extract)
+    end
+
+    # Public: Converts extracted data to a dataframe.
+    # Currently only supports Daru DataFrames.
+    #
+    # Returns a Remi::DataFrame
+    def to_dataframe
+      # Assumes that each file has exactly the same structure
+      result_df = nil
+      extract.each_with_index do |filename, idx|
+        @logger.info "Converting #{filename} to a dataframe"
+        csv_df = Daru::DataFrame.from_csv filename, @csv_options
+
+        csv_df[@filename_field] = Daru::Vector.new([filename] * csv_df.size, index: csv_df.index) if @filename_field
+        if idx == 0
+          result_df = csv_df
+        else
+          result_df = result_df.concat csv_df
+        end
+      end
+
+      Remi::DataFrame.create(:daru, result_df)
+    end
+
+
+
+    def extractor=(arg)
+      case arg
+      when Extractor::SftpFile, Extractor::LocalFile
+        @extractor = arg
+      when String
+        @extractor = Extractor::LocalFile.new(path: arg)
+      when Regexp
+        raise "Adding regex matching to local files would be easy, not done yet"
+      else
+        raise "Unknown extractor of type #{arg.class}: #{arg}"
+      end
+    end
+
+    # Only going to support single file for now
+    def source_filename
+      raise "Multiple source files detected" if extract.size > 1
+      @source_filename ||= extract.first
+    end
+
+    def first_line
+      # Readline assumes \n line endings. Strip out \r if it is a DOS file.
+      @first_line ||= File.open(source_filename) do |f|
+        f.readline.gsub(/\r/,'')
+      end
+    end
+
+    def headers
+      @headers ||= CSV.open(source_filename, 'r', source_csv_options) { |csv| csv.first }.headers
+    end
+
+    def valid_headers?
+      (fields.keys - headers).empty?
+    end
+
+
+    private
+
+    def init_csv_file(*args, extractor:, csv_options: {}, filename_field: nil, **kargs, &block)
+      self.extractor = extractor
+      @csv_options = self.class.default_csv_options.merge(csv_options)
+      @filename_field = filename_field
+    end
+  end
+
+
+
+
+
+  class DataTarget::CsvFile < Remi::DataSubject
+    include ::Remi::DataSubject::DataTarget
+    include ::Remi::DataSubject::CsvFile
+
+    default_csv_options[:row_sep] = "\n"
+
+    def initialize(*args, **kargs, &block)
+      super
+      init_csv_file(*args, **kargs, &block)
+    end
+
+    attr_reader :csv_options
+
+    # Public: Performs the load operation, regardless of whether it has
+    # already executed.
+    #
+    # Returns true if the load operation was successful
+    def load!
+      @logger.info "Writing CSV file #{@path}"
+      df.write_csv @path, @csv_options
+      true
+    end
+
+
+    private
+
+    def init_csv_file(*args, path:, csv_options: {}, **kargs, &block)
+      @path = path
+      @csv_options = self.class.default_csv_options.merge(csv_options)
+    end
+  end
+end
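A sketch of wiring the two new CSV subjects together; the paths, the col_sep override, and the filename_field name are illustrative, while the keyword arguments come from the init_csv_file signatures above.

source = Remi::DataSource::CsvFile.new(
  extractor: 'input.csv',              # a String is wrapped in an Extractor::LocalFile
  csv_options: { col_sep: '|' },
  filename_field: :from_file
)
target = Remi::DataTarget::CsvFile.new(path: 'output.csv')

target.df = source.df                  # extract! and to_dataframe run lazily via #df
target.load                            # delegates to load!; skipped if already loaded or the frame is empty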