remi 0.2.27 → 0.2.28

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49) hide show
  1. checksums.yaml +4 -4
  2. data/Gemfile +3 -0
  3. data/Gemfile.lock +34 -5
  4. data/features/metadata.feature +17 -0
  5. data/features/step_definitions/remi_step.rb +6 -6
  6. data/features/transforms/date_diff.feature +1 -0
  7. data/jobs/aggregate_job.rb +0 -1
  8. data/jobs/all_jobs_shared.rb +0 -2
  9. data/jobs/copy_source_job.rb +0 -1
  10. data/jobs/csv_file_target_job.rb +0 -1
  11. data/jobs/metadata_job.rb +60 -0
  12. data/jobs/parameters_job.rb +1 -1
  13. data/jobs/sample_job.rb +19 -20
  14. data/jobs/sftp_file_target_job.rb +0 -1
  15. data/jobs/transforms/date_diff_job.rb +1 -1
  16. data/jobs/transforms/nvl_job.rb +1 -1
  17. data/jobs/transforms/parse_date_job.rb +7 -4
  18. data/jobs/transforms/prefix_job.rb +1 -1
  19. data/jobs/transforms/truncate_job.rb +1 -1
  20. data/lib/remi.rb +10 -15
  21. data/lib/remi/cucumber/business_rules.rb +23 -23
  22. data/lib/remi/cucumber/data_source.rb +2 -1
  23. data/lib/remi/data_frame.rb +36 -0
  24. data/lib/remi/data_frame/daru.rb +67 -0
  25. data/lib/remi/data_subject.rb +71 -10
  26. data/lib/remi/data_subject/csv_file.rb +151 -0
  27. data/lib/remi/data_subject/data_frame.rb +53 -0
  28. data/lib/remi/data_subject/postgres.rb +136 -0
  29. data/lib/remi/data_subject/salesforce.rb +136 -0
  30. data/lib/remi/data_subject/sftp_file.rb +66 -0
  31. data/lib/remi/fields.rb +8 -0
  32. data/lib/remi/source_to_target_map.rb +56 -32
  33. data/lib/remi/transform.rb +426 -83
  34. data/lib/remi/version.rb +1 -1
  35. data/remi.gemspec +2 -1
  36. data/spec/metadata_spec.rb +62 -0
  37. metadata +15 -28
  38. data/lib/remi/data_source.rb +0 -13
  39. data/lib/remi/data_source/csv_file.rb +0 -101
  40. data/lib/remi/data_source/data_frame.rb +0 -16
  41. data/lib/remi/data_source/postgres.rb +0 -58
  42. data/lib/remi/data_source/salesforce.rb +0 -87
  43. data/lib/remi/data_target.rb +0 -15
  44. data/lib/remi/data_target/csv_file.rb +0 -42
  45. data/lib/remi/data_target/data_frame.rb +0 -14
  46. data/lib/remi/data_target/postgres.rb +0 -74
  47. data/lib/remi/data_target/salesforce.rb +0 -54
  48. data/lib/remi/data_target/sftp_file.rb +0 -54
  49. data/lib/remi/refinements/daru.rb +0 -85
@@ -13,7 +13,7 @@ class TruncateJob
13
13
  define_transform :main, sources: :source_data, targets: :target_data do
14
14
  Remi::SourceToTargetMap.apply(source_data.df, target_data.df) do
15
15
  map source(:my_field) .target(:truncated_field)
16
- .transform(Remi::Transform[:truncate].(params[:truncate_len].to_i))
16
+ .transform(Remi::Transform::Truncate.new(params[:truncate_len].to_i))
17
17
  end
18
18
  end
19
19
  end
@@ -39,25 +39,20 @@ require 'remi/settings'
39
39
  require 'remi/job'
40
40
  require 'remi/source_to_target_map'
41
41
  require 'remi/field_symbolizers'
42
- require 'remi/data_subject'
43
- require 'remi/sf_bulk_helper' # separate into SF support package
44
42
 
45
43
  require 'remi/refinements/symbolizer'
46
- require 'remi/refinements/daru'
47
44
 
48
45
  require 'remi/extractor/sftp_file'
49
46
 
50
- require 'remi/data_source.rb'
51
- require 'remi/data_source/data_frame'
52
- require 'remi/data_source/csv_file'
53
- require 'remi/data_source/salesforce'
54
- require 'remi/data_source/postgres'
55
-
56
- require 'remi/data_target.rb'
57
- require 'remi/data_target/data_frame'
58
- require 'remi/data_target/salesforce'
59
- require 'remi/data_target/csv_file'
60
- require 'remi/data_target/sftp_file'
61
- require 'remi/data_target/postgres'
47
+ require 'remi/fields'
48
+ require 'remi/data_frame'
49
+ require 'remi/data_frame/daru'
50
+
51
+ require 'remi/data_subject'
52
+ require 'remi/data_subject/csv_file'
53
+ #require 'remi/data_subject/salesforce' # intentionally not included by default
54
+ require 'remi/data_subject/postgres'
55
+ require 'remi/data_subject/sftp_file'
56
+ require 'remi/data_subject/data_frame'
62
57
 
63
58
  require 'remi/transform'
@@ -250,14 +250,14 @@ module Remi::BusinessRules
250
250
  class DataSubject
251
251
  def initialize(name, subject)
252
252
  @name = name
253
- @data_obj = subject
253
+ @data_subject = subject
254
254
  @fields = DataFieldCollection.new
255
255
 
256
256
  stub_data
257
257
  end
258
258
 
259
259
  attr_reader :name
260
- attr_reader :data_obj
260
+ attr_reader :data_subject
261
261
 
262
262
  def add_field(field_name)
263
263
  @fields.add_field(self, field_name)
@@ -272,17 +272,17 @@ module Remi::BusinessRules
272
272
  end
273
273
 
274
274
  def size
275
- @data_obj.df.size
275
+ @data_subject.df.size
276
276
  end
277
277
 
278
278
  def get_attrib(name)
279
- @data_obj.send(name)
279
+ @data_subject.send(name)
280
280
  end
281
281
 
282
282
  # Public: Converts the data subject to a hash where the keys are the table
283
283
  # columns and the values are arrays holding that column's value for each row.
284
284
  def column_hash
285
- @data_obj.df.to_hash.reduce({}) do |h, (k,v)|
285
+ @data_subject.df.to_h.reduce({}) do |h, (k,v)|
286
286
  h[k.symbolize] = v.to_a
287
287
  h
288
288
  end
@@ -290,7 +290,7 @@ module Remi::BusinessRules
290
290
 
291
291
  # For debugging only
292
292
  def _df
293
- @data_obj.df
293
+ @data_subject.df
294
294
  end
295
295
 
296
296
 
@@ -298,7 +298,7 @@ module Remi::BusinessRules
298
298
  # Need more robust duping to make that feasible.
299
299
  # Don't use results for anything more than size.
300
300
  def where(field_name, operation)
301
- @data_obj.df.where(@data_obj.df[field_name.symbolize(@data_obj.field_symbolizer)].recode { |v| operation.call(v) })
301
+ @data_subject.df.where(@data_subject.df[field_name.symbolize(@data_subject.field_symbolizer)].recode { |v| operation.call(v) })
302
302
  end
303
303
 
304
304
  def where_is(field_name, value)
@@ -324,29 +324,29 @@ module Remi::BusinessRules
324
324
 
325
325
 
326
326
  def stub_data
327
- @data_obj.stub_df if @data_obj.respond_to? :stub_df
327
+ @data_subject.stub_df if @data_subject.respond_to? :stub_df
328
328
  end
329
329
 
330
330
  def example_to_df(example)
331
- example.to_df(@data_obj.df.row[0].to_hash, field_symbolizer: @data_obj.field_symbolizer)
331
+ example.to_df(@data_subject.df.row[0].to_h, field_symbolizer: @data_subject.field_symbolizer)
332
332
  end
333
333
 
334
334
  def stub_data_with(example)
335
335
  stub_data
336
- @data_obj.df = example_to_df(example)
336
+ @data_subject.df = example_to_df(example)
337
337
  end
338
338
 
339
339
  def append_data_with(example)
340
- @data_obj.df = @data_obj.df.concat example_to_df(example)
340
+ @data_subject.df = @data_subject.df.concat example_to_df(example)
341
341
  end
342
342
 
343
343
 
344
344
  def replicate_rows(n_rows)
345
- replicated_df = Daru::DataFrame.new([], order: @data_obj.df.vectors.to_a)
346
- @data_obj.df.each do |vector|
345
+ replicated_df = Daru::DataFrame.new([], order: @data_subject.df.vectors.to_a)
346
+ @data_subject.df.each do |vector|
347
347
  replicated_df[vector.name] = vector.to_a * n_rows
348
348
  end
349
- @data_obj.df = replicated_df
349
+ @data_subject.df = replicated_df
350
350
  end
351
351
 
352
352
  def cumulative_dist_from_freq_table(table, freq_field: 'frequency')
@@ -378,31 +378,31 @@ module Remi::BusinessRules
378
378
 
379
379
  def distribute_values(table)
380
380
  cumulative_dist = cumulative_dist_from_freq_table(table)
381
- generated_data = generate_values_from_cumulative_dist(@data_obj.df.size, cumulative_dist)
381
+ generated_data = generate_values_from_cumulative_dist(@data_subject.df.size, cumulative_dist)
382
382
 
383
383
  generated_data.each do |field_name, data_array|
384
384
  vector_name = fields[field_name].field_name
385
- @data_obj.df[vector_name] = Daru::Vector.new(data_array, index: @data_obj.df.index)
385
+ @data_subject.df[vector_name] = Daru::Vector.new(data_array, index: @data_subject.df.index)
386
386
  end
387
387
  end
388
388
 
389
389
  def freq_by(*field_names)
390
- @data_obj.df.group_by(field_names).size * 1.0 / @data_obj.df.size
390
+ @data_subject.df.group_by(field_names).size * 1.0 / @data_subject.df.size
391
391
  end
392
392
 
393
393
  def mock_extractor(filestore)
394
- extractor = class << @data_obj.extractor; self; end
394
+ extractor = class << @data_subject.extractor; self; end
395
395
 
396
396
  extractor.send(:define_method, :all_entries, ->() { filestore.sftp_entries })
397
397
  extractor.send(:define_method, :download, ->(to_download) { to_download.map { |e| e.name } })
398
398
  end
399
399
 
400
400
  def extract
401
- @data_obj.extractor.extract
401
+ @data_subject.extractor.extract
402
402
  end
403
403
 
404
404
  def csv_options
405
- @data_obj.csv_options
405
+ @data_subject.csv_options
406
406
  end
407
407
 
408
408
  end
@@ -456,7 +456,7 @@ module Remi::BusinessRules
456
456
  def initialize(subject, name)
457
457
  @subject = subject
458
458
  @name = name
459
- @field_name = name.symbolize(subject.data_obj.field_symbolizer)
459
+ @field_name = name.symbolize(subject.data_subject.field_symbolizer)
460
460
  end
461
461
 
462
462
  attr_reader :name
@@ -468,11 +468,11 @@ module Remi::BusinessRules
468
468
  end
469
469
 
470
470
  def metadata
471
- @subject.data_obj.fields[@field_name]
471
+ @subject.data_subject.fields[@field_name]
472
472
  end
473
473
 
474
474
  def vector
475
- @subject.data_obj.df[@field_name]
475
+ @subject.data_subject.df[@field_name]
476
476
  end
477
477
 
478
478
  def value
@@ -54,7 +54,8 @@ module Remi
54
54
  end
55
55
  end
56
56
 
57
- class Salesforce
57
+ # Hmmm.... this gets called first because I'm trying to split SF off as a "plugin"
58
+ class Salesforce < Remi::DataSubject
58
59
  include DataStub
59
60
  end
60
61
 
@@ -0,0 +1,36 @@
1
+ module Remi
2
+ module DataFrame
3
+ class << self
4
+ def create(remi_df_type = :daru, *args, **kargs, &block)
5
+ dataframe = case remi_df_type
6
+ when :daru
7
+ Remi::DataFrame::Daru.new(*args, **kargs, &block)
8
+ else
9
+ raise TypeError, "Unknown frame type: #{remi_df_type}"
10
+ end
11
+ end
12
+
13
+ def daru(*args, **kargs, &block)
14
+ self.create(:daru, *args, **kargs, &block)
15
+ end
16
+ end
17
+
18
+
19
+ def [](*args)
20
+ super
21
+ end
22
+
23
+ def size
24
+ super
25
+ end
26
+
27
+ def write_csv(*args, **kargs, &block)
28
+ super
29
+ end
30
+
31
+ # Public: Returns the type of DataFrame
32
+ def remi_df_type
33
+ raise NoMethodError, "#{__method__} not defined for #{self.class.name}"
34
+ end
35
+ end
36
+ end
@@ -0,0 +1,67 @@
1
+ module Remi
2
+ module DataFrame
3
+ class Daru < SimpleDelegator
4
+ include Remi::DataFrame
5
+
6
+ def initialize(*args, **kargs, &block)
7
+ if args[0].is_a? ::Daru::DataFrame
8
+ super(args[0])
9
+ else
10
+ super(::Daru::DataFrame.new(*args, **kargs, &block))
11
+ end
12
+ end
13
+
14
+
15
+ # Public: Returns the type of DataFrame
16
+ def remi_df_type
17
+ :daru
18
+ end
19
+
20
+ # Public: Saves a Dataframe to a file.
21
+ def hash_dump(filename)
22
+ File.binwrite(filename, Marshal.dump(self))
23
+ end
24
+
25
+ # Public: Creates a DataFrame by reading the dumped version from a file.
26
+ def self.from_hash_dump(filename)
27
+ Marshal.load(File.binread(filename))
28
+ end
29
+
30
+ # Public: Allows the user to define an arbitrary aggregation function.
31
+ #
32
+ # by - The name of the DataFrame vector to use to group records.
33
+ # func - A lambda function that accepts three arguments - the
34
+ # first argument is the DataFrame, the second is the
35
+ # key to the current group, and the third is the index
36
+ # of the elements belonging to a group.
37
+ #
38
+ # Example:
39
+ # df = Remi::DataFrame::Daru.new( { a: ['a','a','a','b','b'], year: ['2018','2015','2019', '2014', '2013'] })
40
+ #
41
+ # mymin = lambda do |vector, df, group_key, indices|
42
+ # values = indices.map { |idx| df.row[idx][vector] }
43
+ # "Group #{group_key} has a minimum value of #{values.min}"
44
+ # end
45
+ #
46
+ # df.aggregate(by: :a, func: mymin.curry.(:year))
47
+ #
48
+ #
49
+ # Returns a Daru::Vector.
50
+ def aggregate(by:, func:)
51
+ grouped = self.group_by(by)
52
+ df_indices = self.index.to_a
53
+ ::Daru::Vector.new(
54
+ grouped.groups.reduce({}) do |h, (key, indices)|
55
+ # Daru groups don't use the index of the dataframe when returning groups (WTF?).
56
+ # Instead they return the position of the record in the dataframe. Here, we map those positions back to the dataframe's index values.
57
+ group_df_indices = indices.map { |v| df_indices[v] }
58
+ group_key = key.size == 1 ? key.first : key
59
+ h[group_key] = func.(self, group_key, group_df_indices)
60
+ h
61
+ end
62
+ )
63
+ end
64
+
65
+ end
66
+ end
67
+ end
@@ -1,24 +1,85 @@
1
1
  module Remi
2
- module DataSubject
2
+
3
+ # Namespaces for specific sources/targets
4
+ module DataSource; end
5
+ module DataTarget; end
6
+
7
+ class DataSubject
8
+ def initialize(*args, fields: Remi::Fields.new, remi_df_type: :daru, logger: Remi::Settings.logger, **kargs, &block)
9
+ @fields = fields
10
+ @remi_df_type = remi_df_type
11
+ @logger = logger
12
+ end
13
+
14
+ attr_accessor :fields
15
+
3
16
  def field_symbolizer
4
17
  Remi::FieldSymbolizers[:standard]
5
18
  end
6
19
 
7
20
  def df
8
- @dataframe ||= Daru::DataFrame.new([])
21
+ @dataframe ||= Remi::DataFrame.create(@remi_df_type, [], order: @fields.keys)
9
22
  end
10
23
 
11
24
  def df=(new_dataframe)
12
- @dataframe = new_dataframe
25
+ if new_dataframe.respond_to? :remi_df_type
26
+ @dataframe = new_dataframe
27
+ else
28
+ @dataframe = Remi::DataFrame.create(@remi_df_type, new_dataframe)
29
+ end
13
30
  end
14
31
 
15
- # Fields is a hash where the keys are the data field names and the values
16
- # are a hash of metadata. DataFrames do not currently support metadata,
17
- # so the metdata will be empty unless overridden by the specific target.
18
- def fields
19
- df.vectors.to_a.reduce({}) do |h, v|
20
- h[v] = {}
21
- h
32
+ module DataSource
33
+
34
+ # Public: Access the dataframe from a DataSource
35
+ #
36
+ # Returns a Remi::DataFrame
37
+ def df
38
+ @dataframe ||= to_dataframe
39
+ end
40
+
41
+ # Public: Memoized version of extract!
42
+ def extract
43
+ @extract ||= extract!
44
+ end
45
+
46
+ # Public: Called to extract data from the source.
47
+ #
48
+ # Returns data in a format that can be used to create a dataframe.
49
+ def extract!
50
+ raise NoMethodError, "#{__method__} not defined for #{self.class.name}"
51
+ @extract
52
+ end
53
+
54
+ # Public: Converts extracted data to a dataframe
55
+ #
56
+ # Returns a Remi::DataFrame
57
+ def to_dataframe
58
+ raise NoMethodError, "#{__method__} not defined for #{self.class.name}"
59
+ end
60
+ end
61
+
62
+ module DataTarget
63
+
64
+ # Public: Loads data to the target. This is automatically called
65
+ # after all transforms have executed, but could also get called manually.
66
+ # The actual load operation is only executed if it hasn't already been executed.
67
+ #
68
+ # Returns true if the load operation was successful.
69
+ def load
70
+ return true if @loaded || df.size == 0
71
+
72
+ @loaded = load!
73
+ end
74
+
75
+ # Public: Performs the load operation, regardless of whether it has
76
+ # already executed.
77
+ #
78
+ # Returns true if the load operation was successful
79
+ def load!
80
+ raise NoMethodError, "#{__method__} not defined for #{self.class.name}"
81
+
82
+ false
22
83
  end
23
84
  end
24
85
  end
@@ -0,0 +1,151 @@
1
+ module Remi
2
+ module DataSubject::CsvFile
3
+ def self.included(base)
4
+ base.extend(CsvFileClassMethods)
5
+ end
6
+
7
+ def field_symbolizer
8
+ self.class.default_csv_options[:header_converters]
9
+ end
10
+
11
+ module CsvFileClassMethods
12
+ def default_csv_options
13
+ @default_csv_options ||= CSV::DEFAULT_OPTIONS.merge({
14
+ headers: true,
15
+ header_converters: Remi::FieldSymbolizers[:standard],
16
+ converters: [],
17
+ col_sep: ',',
18
+ encoding: 'UTF-8',
19
+ quote_char: '"'
20
+ })
21
+ end
22
+ end
23
+ end
24
+
25
+
26
+
27
+
28
+
29
+ class DataSource::CsvFile < Remi::DataSubject
30
+ include Remi::DataSubject::DataSource
31
+ include Remi::DataSubject::CsvFile
32
+
33
+ def initialize(*args, **kargs, &block)
34
+ super
35
+ init_csv_file(*args, **kargs, &block)
36
+ end
37
+
38
+ attr_reader :extractor
39
+ attr_reader :csv_options
40
+
41
+ # Public: Called to extract data from the source.
42
+ #
43
+ # Returns data in a format that can be used to create a dataframe.
44
+ def extract!
45
+ @extract = Array(@extractor.extract)
46
+ end
47
+
48
+ # Public: Converts extracted data to a dataframe.
49
+ # Currently only supports Daru DataFrames.
50
+ #
51
+ # Returns a Remi::DataFrame
52
+ def to_dataframe
53
+ # Assumes that each file has exactly the same structure
54
+ result_df = nil
55
+ extract.each_with_index do |filename, idx|
56
+ @logger.info "Converting #{filename} to a dataframe"
57
+ csv_df = Daru::DataFrame.from_csv filename, @csv_options
58
+
59
+ csv_df[@filename_field] = Daru::Vector.new([filename] * csv_df.size, index: csv_df.index) if @filename_field
60
+ if idx == 0
61
+ result_df = csv_df
62
+ else
63
+ result_df = result_df.concat csv_df
64
+ end
65
+ end
66
+
67
+ Remi::DataFrame.create(:daru, result_df)
68
+ end
69
+
70
+
71
+
72
+ def extractor=(arg)
73
+ case arg
74
+ when Extractor::SftpFile, Extractor::LocalFile
75
+ @extractor = arg
76
+ when String
77
+ @extractor = Extractor::LocalFile.new(path: arg)
78
+ when Regexp
79
+ raise "Adding regex matching to local files would be easy, not done yet"
80
+ else
81
+ raise "Unknown extractor of type #{arg.class}: #{arg}"
82
+ end
83
+ end
84
+
85
+ # Only going to support single file for now
86
+ def source_filename
87
+ raise "Multiple source files detected" if extract.size > 1
88
+ @source_filename ||= extract.first
89
+ end
90
+
91
+ def first_line
92
+ # Readline assumes \n line endings. Strip out \r if it is a DOS file.
93
+ @first_line ||= File.open(source_filename) do |f|
94
+ f.readline.gsub(/\r/,'')
95
+ end
96
+ end
97
+
98
+ def headers
99
+ @headers ||= CSV.open(source_filename, 'r', source_csv_options) { |csv| csv.first }.headers
100
+ end
101
+
102
+ def valid_headers?
103
+ (fields.keys - headers).empty?
104
+ end
105
+
106
+
107
+ private
108
+
109
+ def init_csv_file(*args, extractor:, csv_options: {}, filename_field: nil, **kargs, &block)
110
+ self.extractor = extractor
111
+ @csv_options = self.class.default_csv_options.merge(csv_options)
112
+ @filename_field = filename_field
113
+ end
114
+ end
115
+
116
+
117
+
118
+
119
+
120
+ class DataTarget::CsvFile < Remi::DataSubject
121
+ include ::Remi::DataSubject::DataTarget
122
+ include ::Remi::DataSubject::CsvFile
123
+
124
+ default_csv_options[:row_sep] = "\n"
125
+
126
+ def initialize(*args, **kargs, &block)
127
+ super
128
+ init_csv_file(*args, **kargs, &block)
129
+ end
130
+
131
+ attr_reader :csv_options
132
+
133
+ # Public: Performs the load operation, regardless of whether it has
134
+ # already executed.
135
+ #
136
+ # Returns true if the load operation was successful
137
+ def load!
138
+ @logger.info "Writing CSV file #{@path}"
139
+ df.write_csv @path, @csv_options
140
+ true
141
+ end
142
+
143
+
144
+ private
145
+
146
+ def init_csv_file(*args, path:, csv_options: {}, **kargs, &block)
147
+ @path = path
148
+ @csv_options = self.class.default_csv_options.merge(csv_options)
149
+ end
150
+ end
151
+ end