remi 0.2.42 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. checksums.yaml +4 -4
  2. data/.yardopts +7 -0
  3. data/Gemfile +1 -1
  4. data/Gemfile.lock +13 -26
  5. data/README.md +1 -1
  6. data/features/step_definitions/remi_step.rb +33 -13
  7. data/features/sub_job_example.feature +24 -0
  8. data/features/sub_transform_example.feature +35 -0
  9. data/features/sub_transform_many_to_many.feature +49 -0
  10. data/features/support/env_app.rb +1 -1
  11. data/jobs/all_jobs_shared.rb +19 -16
  12. data/jobs/copy_source_job.rb +11 -9
  13. data/jobs/csv_file_target_job.rb +10 -9
  14. data/jobs/json_job.rb +18 -14
  15. data/jobs/metadata_job.rb +33 -28
  16. data/jobs/parameters_job.rb +14 -11
  17. data/jobs/sample_job.rb +106 -77
  18. data/jobs/sftp_file_target_job.rb +14 -13
  19. data/jobs/sub_job_example_job.rb +86 -0
  20. data/jobs/sub_transform_example_job.rb +43 -0
  21. data/jobs/sub_transform_many_to_many_job.rb +46 -0
  22. data/jobs/transforms/concatenate_job.rb +16 -12
  23. data/jobs/transforms/data_frame_sieve_job.rb +24 -19
  24. data/jobs/transforms/date_diff_job.rb +15 -11
  25. data/jobs/transforms/nvl_job.rb +16 -12
  26. data/jobs/transforms/parse_date_job.rb +17 -14
  27. data/jobs/transforms/partitioner_job.rb +27 -19
  28. data/jobs/transforms/prefix_job.rb +13 -10
  29. data/jobs/transforms/truncate_job.rb +14 -10
  30. data/jobs/transforms/truthy_job.rb +11 -8
  31. data/lib/remi.rb +25 -11
  32. data/lib/remi/data_frame.rb +4 -4
  33. data/lib/remi/data_frame/daru.rb +1 -37
  34. data/lib/remi/data_subject.rb +234 -48
  35. data/lib/remi/data_subjects/csv_file.rb +171 -0
  36. data/lib/remi/data_subjects/data_frame.rb +106 -0
  37. data/lib/remi/data_subjects/file_system.rb +115 -0
  38. data/lib/remi/data_subjects/local_file.rb +109 -0
  39. data/lib/remi/data_subjects/none.rb +31 -0
  40. data/lib/remi/data_subjects/postgres.rb +186 -0
  41. data/lib/remi/data_subjects/s3_file.rb +84 -0
  42. data/lib/remi/data_subjects/salesforce.rb +211 -0
  43. data/lib/remi/data_subjects/sftp_file.rb +196 -0
  44. data/lib/remi/data_subjects/sub_job.rb +50 -0
  45. data/lib/remi/dsl.rb +74 -0
  46. data/lib/remi/encoder.rb +45 -0
  47. data/lib/remi/extractor.rb +21 -0
  48. data/lib/remi/field_symbolizers.rb +1 -0
  49. data/lib/remi/job.rb +279 -113
  50. data/lib/remi/job/parameters.rb +90 -0
  51. data/lib/remi/job/sub_job.rb +35 -0
  52. data/lib/remi/job/transform.rb +165 -0
  53. data/lib/remi/loader.rb +22 -0
  54. data/lib/remi/monkeys/daru.rb +4 -0
  55. data/lib/remi/parser.rb +44 -0
  56. data/lib/remi/testing/business_rules.rb +17 -23
  57. data/lib/remi/testing/data_stub.rb +2 -2
  58. data/lib/remi/version.rb +1 -1
  59. data/remi.gemspec +3 -0
  60. data/spec/data_subject_spec.rb +475 -11
  61. data/spec/data_subjects/csv_file_spec.rb +69 -0
  62. data/spec/data_subjects/data_frame_spec.rb +52 -0
  63. data/spec/{extractor → data_subjects}/file_system_spec.rb +0 -0
  64. data/spec/{extractor → data_subjects}/local_file_spec.rb +0 -0
  65. data/spec/data_subjects/none_spec.rb +41 -0
  66. data/spec/data_subjects/postgres_spec.rb +80 -0
  67. data/spec/{extractor → data_subjects}/s3_file_spec.rb +0 -0
  68. data/spec/data_subjects/salesforce_spec.rb +117 -0
  69. data/spec/{extractor → data_subjects}/sftp_file_spec.rb +16 -0
  70. data/spec/data_subjects/sub_job_spec.rb +33 -0
  71. data/spec/encoder_spec.rb +38 -0
  72. data/spec/extractor_spec.rb +11 -0
  73. data/spec/fixtures/sf_bulk_helper_stubs.rb +443 -0
  74. data/spec/job/transform_spec.rb +257 -0
  75. data/spec/job_spec.rb +507 -0
  76. data/spec/loader_spec.rb +11 -0
  77. data/spec/parser_spec.rb +38 -0
  78. data/spec/sf_bulk_helper_spec.rb +117 -0
  79. data/spec/testing/data_stub_spec.rb +5 -3
  80. metadata +109 -27
  81. data/features/aggregate.feature +0 -42
  82. data/jobs/aggregate_job.rb +0 -31
  83. data/jobs/transforms/transform_jobs.rb +0 -4
  84. data/lib/remi/data_subject/csv_file.rb +0 -162
  85. data/lib/remi/data_subject/data_frame.rb +0 -52
  86. data/lib/remi/data_subject/postgres.rb +0 -134
  87. data/lib/remi/data_subject/salesforce.rb +0 -136
  88. data/lib/remi/data_subject/sftp_file.rb +0 -65
  89. data/lib/remi/extractor/file_system.rb +0 -92
  90. data/lib/remi/extractor/local_file.rb +0 -43
  91. data/lib/remi/extractor/s3_file.rb +0 -57
  92. data/lib/remi/extractor/sftp_file.rb +0 -83
  93. data/spec/data_subject/csv_file_spec.rb +0 -79
  94. data/spec/data_subject/data_frame.rb +0 -27
@@ -0,0 +1,171 @@
1
+ module Remi
2
+
3
+ # @api private
4
+ #
5
+ # Contains methods shared between CsvFile Parser/Encoder
6
+ module DataSubject::CsvFile
7
+ def self.included(base)
8
+ base.extend(CsvFileClassMethods)
9
+ end
10
+
11
+ module CsvFileClassMethods
12
+ def default_csv_options
13
+ @default_csv_options ||= CSV::DEFAULT_OPTIONS.merge({
14
+ headers: true,
15
+ header_converters: Remi::FieldSymbolizers[:standard],
16
+ converters: [],
17
+ col_sep: ',',
18
+ encoding: 'UTF-8',
19
+ quote_char: '"'
20
+ })
21
+ end
22
+ end
23
+ end
24
+
25
+ # @api public
26
+ #
27
+ # CsvFile parser
28
+ #
29
+ # @example
30
+ #
31
+ # class MyJob < Remi::Job
32
+ # source :some_file do
33
+ # extractor Remi::Extractor::LocalFile.new(
34
+ # remote_path: 'some_file.csv'
35
+ # )
36
+ # parser Remi::Parser::CsvFile.new(
37
+ # csv_options: {
38
+ # headers: true,
39
+ # col_sep: '|'
40
+ # }
41
+ # )
42
+ # end
43
+ # end
44
+ #
45
+ # job = MyJob.new
46
+ # job.some_file.df
47
+ # # =>#<Daru::DataFrame:70153153438500 @name = 4c59cfdd-7de7-4264-8666-83153f46a9e4 @size = 3>
48
+ # # id name
49
+ # # 0 1 Albert
50
+ # # 1 2 Betsy
51
+ # # 2 3 Camu
52
+ class Parser::CsvFile < Parser
53
+ include Remi::DataSubject::CsvFile
54
+
55
+ # @param csv_options [Hash] Standard Ruby CSV parsing options.
56
+ # @param filename_field [Symbol] Name of the field to be used to write
57
+ # the filename of the CSV being parsed (default: nil, meaning no field will be used)
58
+ # @param preprocessor [Proc] A proc used to pre-process lines of the CSV file before being parsed
59
+ def initialize(*args, **kargs, &block)
60
+ super
61
+ init_csv_file(*args, **kargs, &block)
62
+ end
63
+
64
+ # @return [Hash] Csv options hash
65
+ attr_reader :csv_options
66
+
67
+ # Converts a list of filenames into a dataframe after parsing them
68
+ # according to the csv options that were set
69
+ # @param data [Object] Extracted data that needs to be parsed
70
+ # @return [Remi::DataFrame] The data converted into a dataframe
71
+ def parse(data)
72
+ # Assumes that each file has exactly the same structure
73
+ result_df = nil
74
+ Array(data).each_with_index do |filename, idx|
75
+ filename = filename.to_s
76
+
77
+ logger.info "Converting #{filename} to a dataframe"
78
+ processed_filename = preprocess(filename)
79
+ csv_df = Daru::DataFrame.from_csv processed_filename, @csv_options
80
+
81
+ csv_df[@filename_field] = Daru::Vector.new([filename] * csv_df.size, index: csv_df.index) if @filename_field
82
+ if idx == 0
83
+ result_df = csv_df
84
+ else
85
+ result_df = result_df.concat csv_df
86
+ end
87
+ end
88
+
89
+ Remi::DataFrame.create(:daru, result_df)
90
+ end
91
+
92
+
93
+ private
94
+
95
+ def preprocess(filename)
96
+ return filename unless @preprocessor
97
+ logger.info "Preprocessing #{filename}"
98
+ tmp_filename = File.join(Remi::Settings.work_dir, "#{Pathname.new(filename).basename}-#{SecureRandom.uuid}")
99
+
100
+ dirname = Pathname.new(tmp_filename).dirname
101
+ FileUtils.mkdir_p(dirname) unless File.directory? dirname
102
+
103
+ File.open(tmp_filename, 'w') do |outfile|
104
+ File.foreach(filename) do |in_line|
105
+ outfile.write @preprocessor.call(in_line)
106
+ end
107
+ end
108
+
109
+ tmp_filename
110
+ end
111
+
112
+ def init_csv_file(*args, csv_options: {}, filename_field: nil, preprocessor: nil, **kargs, &block)
113
+ @csv_options = self.class.default_csv_options.merge(csv_options)
114
+ @filename_field = filename_field
115
+ @preprocessor = preprocessor
116
+ end
117
+ end
118
+
119
+
120
+
121
+
122
+ # CsvFile Encoder
123
+ #
124
+ # @example
125
+ # class MyJob < Remi::Job
126
+ # target :my_target do
127
+ # encoder Remi::Encoder::CsvFile.new(
128
+ # csv_options: { col_sep: '|' }
129
+ # )
130
+ # loader Remi::Loader::LocalFile.new(
131
+ # path: 'test.csv'
132
+ # )
133
+ # end
134
+ # end
135
+ #
136
+ # my_df = Daru::DataFrame.new({ a: 1.upto(5).to_a, b: 6.upto(10).to_a })
137
+ # job = MyJob.new
138
+ # job.my_target.df = my_df
139
+ # job.my_target.load
140
+ class Encoder::CsvFile < Encoder
141
+ include Remi::DataSubject::CsvFile
142
+
143
+ # @param work_path [String,Pathname] Path to a directory used to temporarily store CSV files (default: Settings.work_dir)
144
+ # @param csv_options [Hash] Standard Ruby CSV options used when writing the file.
145
+ def initialize(*args, **kargs, &block)
146
+ super
147
+ init_csv_file_encoder(*args, **kargs, &block)
148
+ end
149
+
150
+ default_csv_options[:row_sep] = "\n"
151
+
152
+ # @return [Hash] Csv options hash
153
+ attr_reader :csv_options
154
+
155
+ # Converts the dataframe to a CSV file stored in the local work directory.
156
+ #
157
+ # @param dataframe [Remi::DataFrame] The dataframe to be encoded
158
+ # @return [Object] The path to the file
159
+ def encode(dataframe)
160
+ logger.info "Writing CSV file to temporary location #{@working_file}"
161
+ dataframe.write_csv @working_file, @csv_options
162
+ @working_file
163
+ end
164
+
165
+ private
166
+ def init_csv_file_encoder(*args, work_path: Settings.work_dir, csv_options: {}, **kargs, &block)
167
+ @working_file = File.join(work_path, SecureRandom.uuid)
168
+ @csv_options = self.class.default_csv_options.merge(csv_options)
169
+ end
170
+ end
171
+ end
@@ -0,0 +1,106 @@
1
+ module Remi
2
+
3
+ # DataFrame extractor.
4
+ # This class is used to hard-code a dataframe as a simple array of rows.
5
+ #
6
+ # @example
7
+ #
8
+ # class MyJob < Remi::Job
9
+ # source :my_df do
10
+ # fields ({ id: {}, name: {}})
11
+ # extractor Remi::Extractor::DataFrame.new(
12
+ # data: [
13
+ # [1, 'Albert'],
14
+ # [2, 'Betsy'],
15
+ # [3, 'Camu']
16
+ # ]
17
+ # )
18
+ # parser Remi::Parser::DataFrame.new
19
+ # end
20
+ # end
21
+ #
22
+ # job = MyJob.new
23
+ # job.my_df.df.inspect
24
+ # # =>#<Daru::DataFrame:70153153438500 @name = 4c59cfdd-7de7-4264-8666-83153f46a9e4 @size = 3>
25
+ # # id name
26
+ # # 0 1 Albert
27
+ # # 1 2 Betsy
28
+ # # 2 3 Camu
29
+ class Extractor::DataFrame < Extractor
30
+
31
+ # @param data [Array<Array>] An array of arrays representing rows of a dataframe.
32
+ def initialize(*args, **kargs, &block)
33
+ super
34
+ init_data_frame_extractor(*args, **kargs, &block)
35
+ end
36
+
37
+ attr_accessor :data
38
+
39
+ # @return [Object] self
40
+ def extract
41
+ self
42
+ end
43
+
44
+ private
45
+
46
+ def init_data_frame_extractor(*args, data: [], **kargs, &block)
47
+ @data = data
48
+ end
49
+
50
+ end
51
+
52
+ # DataFrame parser.
53
+ # In order for the data from Extractor::DataFrame to be parsed correctly, fields must be defined
54
+ # on the data subject.
55
+ #
56
+ # @example
57
+ #
58
+ # class MyJob < Remi::Job
59
+ # source :my_df do
60
+ # fields ({ id: {}, name: {}})
61
+ # extractor Remi::Extractor::DataFrame.new(
62
+ # data: [
63
+ # [1, 'Albert'],
64
+ # [2, 'Betsy'],
65
+ # [3, 'Camu']
66
+ # ]
67
+ # )
68
+ # parser Remi::Parser::DataFrame.new
69
+ # end
70
+ # end
71
+ #
72
+ # job = MyJob.new
73
+ # job.my_df.df.inspect
74
+ # # =>#<Daru::DataFrame:70153153438500 @name = 4c59cfdd-7de7-4264-8666-83153f46a9e4 @size = 3>
75
+ # # id name
76
+ # # 0 1 Albert
77
+ # # 1 2 Betsy
78
+ # # 2 3 Camu
79
+ class Parser::DataFrame < Parser
80
+ # @param df_extract [Extractor::DataFrame] An object containing data extracted from memory
81
+ # @return [Remi::DataFrame] The data converted into a dataframe
82
+ def parse(df_extract)
83
+ Remi::DataFrame.create(:daru, df_extract.data.transpose, order: fields.keys)
84
+ end
85
+ end
86
+
87
+ # DataFrame encoder
88
+ class Encoder::DataFrame < Encoder
89
+ # @param data_frame [Remi::DataFrame] The data_frame to be encoded
90
+ # @return [Object] The data_frame
91
+ def encode(data_frame)
92
+ data_frame
93
+ end
94
+ end
95
+
96
+ # DataFrame loader
97
+ # Not sure this is needed, right?
98
+ # Maybe on SubJobs?
99
+ class Loader::DataFrame < Loader
100
+ # @param data [Object] Encoded data (unused; this loader does nothing with it)
101
+ # @return [true] On success
102
+ def load(data)
103
+ true
104
+ end
105
+ end
106
+ end
@@ -0,0 +1,115 @@
1
+ module Remi
2
+
3
+ # Defines properties of an entry in a filesystem.
4
+ class Extractor::FileSystemEntry
5
+ # @param pathname [String] The path to the file system entry
6
+ # @param create_time [Time] The time the entry was created
7
+ # @param modified_time [Time] The time the entry was last modified
8
+ # @param raw [Object] An object that captures all other aspects of the entry, native to the system the entry lives on
9
+ def initialize(pathname:, create_time:, modified_time:, raw: nil)
10
+ @pathname = Pathname.new(pathname)
11
+ @create_time = create_time
12
+ @modified_time = modified_time
13
+ @raw = raw
14
+ end
15
+
16
+ attr_reader :pathname, :create_time, :modified_time, :raw
17
+
18
+ # @return [String] the base name of the entry
19
+ def name
20
+ @pathname.basename.to_s
21
+ end
22
+ end
23
+
24
+
25
+ # Parent class used to describe things that behave like file systems (e.g.,
26
+ # local file systems, ftp servers, S3 objects) to be used for extraction.
27
+ #
28
+ # @param remote_path [String] Path on the remote system that contains the files
29
+ # @param pattern [Regexp] Only files with a name that matches this regular
30
+ # expression are extracted
31
+ # @param local_path [String] Local path to put copies of extracted files
32
+ # @param most_recent_only [true,false] Only extract the most recent file
33
+ # that matches the given pattern
34
+ # @param group_by [Regexp] A regular expression used to group files together
35
+ # and only extract the most recent file in each group
36
+ # @param most_recent_by [Symbol] Indicates the FileSystemEntry property used to determine which
37
+ # file is the most recent (`:create_time` (default), `:modified_time`, `:name`)
38
+
39
+ class Extractor::FileSystem < Extractor
40
+ class FileNotFoundError < StandardError; end
41
+
42
+ def initialize(*args, **kargs, &block)
43
+ super
44
+ init_file_system(*args, **kargs)
45
+ end
46
+
47
+ attr_reader :remote_path
48
+ attr_reader :pattern
49
+ attr_reader :local_path
50
+ attr_reader :most_recent_only
51
+ attr_reader :group_by
52
+ attr_reader :most_recent_by
53
+
54
+ # Public: Called to extract files from the source filesystem.
55
+ #
56
+ # Returns an array containing the paths to all files extracted.
57
+ def extract
58
+ raise NoMethodError, "#{__method__} not defined for#{self.class.name}"
59
+ end
60
+
61
+ # Public: Returns an array of all FileSystemEntry instances that are in the remote_path.
62
+ # NOTE: all_entries is responsible for matching the path using @remote_path
63
+ def all_entries
64
+ raise NoMethodError, "#{__method__} not defined for#{self.class.name}"
65
+ end
66
+
67
+ # Public: Returns just the entries that are to be extracted.
68
+ def entries
69
+ if @group_by
70
+ most_recent_matching_entry_in_group
71
+ elsif @most_recent_only
72
+ Array(most_recent_matching_entry)
73
+ else
74
+ matching_entries
75
+ end
76
+ end
77
+
78
+ def matching_entries
79
+ all_entries.select { |e| @pattern.match e.name }
80
+ end
81
+
82
+ def most_recent_matching_entry
83
+ matching_entries.sort_by { |e| e.send(@most_recent_by) }.reverse.first
84
+ end
85
+
86
+ def most_recent_matching_entry_in_group
87
+ entries_with_group = matching_entries.map do |entry|
88
+ match = entry.name.match(@group_by)
89
+ next unless match
90
+
91
+ group = match.to_a[1..-1]
92
+ { group: group, entry: entry }
93
+ end.compact
94
+ sorted_entries_with_group = entries_with_group.sort_by { |e| [e[:group], e[:entry].send(@most_recent_by)] }.reverse
95
+
96
+ last_group = nil
97
+ sorted_entries_with_group.map do |entry|
98
+ next unless entry[:group] != last_group
99
+ last_group = entry[:group]
100
+ entry[:entry]
101
+ end.compact
102
+ end
103
+
104
+ private
105
+
106
+ def init_file_system(*args, remote_path:, pattern: /.*/, local_path: Settings.work_dir, most_recent_only: false, group_by: nil, most_recent_by: :create_time, **kargs, &block)
107
+ @remote_path = Pathname.new(remote_path)
108
+ @pattern = pattern
109
+ @local_path = Pathname.new(local_path)
110
+ @most_recent_only = most_recent_only
111
+ @group_by = group_by
112
+ @most_recent_by = most_recent_by
113
+ end
114
+ end
115
+ end
@@ -0,0 +1,109 @@
1
+ module Remi
2
+
3
+ # Local file extractor
4
+ # Used to "extract" a file from a local filesystem.
5
+ # Note that even though the file is local, we still use the parameter `remote_path`
6
+ # to indicate the path. This makes this class consistent with Remi::Extractor::FileSystem.
7
+ #
8
+ # @example
9
+ #
10
+ # class MyJob < Remi::Job
11
+ # source :some_file do
12
+ # extractor Remi::Extractor::LocalFile.new(
13
+ # remote_path: 'some_file.csv'
14
+ # )
15
+ # parser Remi::Parser::CsvFile.new(
16
+ # csv_options: {
17
+ # headers: true,
18
+ # col_sep: '|'
19
+ # }
20
+ # )
21
+ # end
22
+ # end
23
+ #
24
+ # job = MyJob.new
25
+ # job.some_file.df
26
+ # # =>#<Daru::DataFrame:70153153438500 @name = 4c59cfdd-7de7-4264-8666-83153f46a9e4 @size = 3>
27
+ # # id name
28
+ # # 0 1 Albert
29
+ # # 1 2 Betsy
30
+ # # 2 3 Camu
31
+ class Extractor::LocalFile < Extractor::FileSystem
32
+ def initialize(*args, **kargs)
33
+ super
34
+ init_local_file(*args, **kargs)
35
+ end
36
+
37
+ # Called to extract files from the source filesystem.
38
+ # @return [Array<Pathname>] An array of paths to a local copy of the files extracted
39
+ def extract
40
+ entries.map(&:pathname)
41
+ end
42
+
43
+ # @return [Array<Extractor::FileSystemEntry>] List of objects in the remote path (memoized after the first call)
44
+ def all_entries
45
+ @all_entries ||= all_entries!
46
+ end
47
+
48
+ # @return [Array<Extractor::FileSystemEntry>] List of objects in the remote path (always re-read, never memoized)
49
+ def all_entries!
50
+ dir = @remote_path.directory? ? @remote_path + '*' : @remote_path
51
+ Dir[dir].map do |entry|
52
+ path = Pathname.new(entry)
53
+ if path.file?
54
+ Extractor::FileSystemEntry.new(
55
+ pathname: path.realpath.to_s,
56
+ create_time: path.ctime,
57
+ modified_time: path.mtime
58
+ )
59
+ end
60
+ end.compact
61
+ end
62
+
63
+ private
64
+
65
+ def init_local_file(*args, **kargs)
66
+ end
67
+ end
68
+
69
+
70
+ # Local file loader
71
+ # Used to output files to a local filesystem
72
+ # @example
73
+ # class MyJob < Remi::Job
74
+ # target :my_target do
75
+ # encoder Remi::Encoder::CsvFile.new(
76
+ # csv_options: { col_sep: '|' }
77
+ # )
78
+ # loader Remi::Loader::LocalFile.new(
79
+ # path: 'test.csv'
80
+ # )
81
+ # end
82
+ # end
83
+ #
84
+ # my_df = Daru::DataFrame.new({ a: 1.upto(5).to_a, b: 6.upto(10).to_a })
85
+ # job = MyJob.new
86
+ # job.my_target.df = my_df
87
+ # job.my_target.load
88
+ class Loader::LocalFile < Loader
89
+ def initialize(*args, **kargs)
90
+ super
91
+ init_local_file_loader(*args, **kargs)
92
+ end
93
+
94
+ # Moves the file from the temporary workspace to another local path
95
+ # @param data [Object] The path to the file in the temporary work location
96
+ # @return [true] On success
97
+ def load(data)
98
+ logger.info "Writing file #{@local_path}"
99
+ FileUtils.mv(data, @local_path)
100
+ end
101
+
102
+
103
+ private
104
+
105
+ def init_local_file_loader(*args, path:, **kargs)
106
+ @local_path = path
107
+ end
108
+ end
109
+ end