remi 0.2.42 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (94) hide show
  1. checksums.yaml +4 -4
  2. data/.yardopts +7 -0
  3. data/Gemfile +1 -1
  4. data/Gemfile.lock +13 -26
  5. data/README.md +1 -1
  6. data/features/step_definitions/remi_step.rb +33 -13
  7. data/features/sub_job_example.feature +24 -0
  8. data/features/sub_transform_example.feature +35 -0
  9. data/features/sub_transform_many_to_many.feature +49 -0
  10. data/features/support/env_app.rb +1 -1
  11. data/jobs/all_jobs_shared.rb +19 -16
  12. data/jobs/copy_source_job.rb +11 -9
  13. data/jobs/csv_file_target_job.rb +10 -9
  14. data/jobs/json_job.rb +18 -14
  15. data/jobs/metadata_job.rb +33 -28
  16. data/jobs/parameters_job.rb +14 -11
  17. data/jobs/sample_job.rb +106 -77
  18. data/jobs/sftp_file_target_job.rb +14 -13
  19. data/jobs/sub_job_example_job.rb +86 -0
  20. data/jobs/sub_transform_example_job.rb +43 -0
  21. data/jobs/sub_transform_many_to_many_job.rb +46 -0
  22. data/jobs/transforms/concatenate_job.rb +16 -12
  23. data/jobs/transforms/data_frame_sieve_job.rb +24 -19
  24. data/jobs/transforms/date_diff_job.rb +15 -11
  25. data/jobs/transforms/nvl_job.rb +16 -12
  26. data/jobs/transforms/parse_date_job.rb +17 -14
  27. data/jobs/transforms/partitioner_job.rb +27 -19
  28. data/jobs/transforms/prefix_job.rb +13 -10
  29. data/jobs/transforms/truncate_job.rb +14 -10
  30. data/jobs/transforms/truthy_job.rb +11 -8
  31. data/lib/remi.rb +25 -11
  32. data/lib/remi/data_frame.rb +4 -4
  33. data/lib/remi/data_frame/daru.rb +1 -37
  34. data/lib/remi/data_subject.rb +234 -48
  35. data/lib/remi/data_subjects/csv_file.rb +171 -0
  36. data/lib/remi/data_subjects/data_frame.rb +106 -0
  37. data/lib/remi/data_subjects/file_system.rb +115 -0
  38. data/lib/remi/data_subjects/local_file.rb +109 -0
  39. data/lib/remi/data_subjects/none.rb +31 -0
  40. data/lib/remi/data_subjects/postgres.rb +186 -0
  41. data/lib/remi/data_subjects/s3_file.rb +84 -0
  42. data/lib/remi/data_subjects/salesforce.rb +211 -0
  43. data/lib/remi/data_subjects/sftp_file.rb +196 -0
  44. data/lib/remi/data_subjects/sub_job.rb +50 -0
  45. data/lib/remi/dsl.rb +74 -0
  46. data/lib/remi/encoder.rb +45 -0
  47. data/lib/remi/extractor.rb +21 -0
  48. data/lib/remi/field_symbolizers.rb +1 -0
  49. data/lib/remi/job.rb +279 -113
  50. data/lib/remi/job/parameters.rb +90 -0
  51. data/lib/remi/job/sub_job.rb +35 -0
  52. data/lib/remi/job/transform.rb +165 -0
  53. data/lib/remi/loader.rb +22 -0
  54. data/lib/remi/monkeys/daru.rb +4 -0
  55. data/lib/remi/parser.rb +44 -0
  56. data/lib/remi/testing/business_rules.rb +17 -23
  57. data/lib/remi/testing/data_stub.rb +2 -2
  58. data/lib/remi/version.rb +1 -1
  59. data/remi.gemspec +3 -0
  60. data/spec/data_subject_spec.rb +475 -11
  61. data/spec/data_subjects/csv_file_spec.rb +69 -0
  62. data/spec/data_subjects/data_frame_spec.rb +52 -0
  63. data/spec/{extractor → data_subjects}/file_system_spec.rb +0 -0
  64. data/spec/{extractor → data_subjects}/local_file_spec.rb +0 -0
  65. data/spec/data_subjects/none_spec.rb +41 -0
  66. data/spec/data_subjects/postgres_spec.rb +80 -0
  67. data/spec/{extractor → data_subjects}/s3_file_spec.rb +0 -0
  68. data/spec/data_subjects/salesforce_spec.rb +117 -0
  69. data/spec/{extractor → data_subjects}/sftp_file_spec.rb +16 -0
  70. data/spec/data_subjects/sub_job_spec.rb +33 -0
  71. data/spec/encoder_spec.rb +38 -0
  72. data/spec/extractor_spec.rb +11 -0
  73. data/spec/fixtures/sf_bulk_helper_stubs.rb +443 -0
  74. data/spec/job/transform_spec.rb +257 -0
  75. data/spec/job_spec.rb +507 -0
  76. data/spec/loader_spec.rb +11 -0
  77. data/spec/parser_spec.rb +38 -0
  78. data/spec/sf_bulk_helper_spec.rb +117 -0
  79. data/spec/testing/data_stub_spec.rb +5 -3
  80. metadata +109 -27
  81. data/features/aggregate.feature +0 -42
  82. data/jobs/aggregate_job.rb +0 -31
  83. data/jobs/transforms/transform_jobs.rb +0 -4
  84. data/lib/remi/data_subject/csv_file.rb +0 -162
  85. data/lib/remi/data_subject/data_frame.rb +0 -52
  86. data/lib/remi/data_subject/postgres.rb +0 -134
  87. data/lib/remi/data_subject/salesforce.rb +0 -136
  88. data/lib/remi/data_subject/sftp_file.rb +0 -65
  89. data/lib/remi/extractor/file_system.rb +0 -92
  90. data/lib/remi/extractor/local_file.rb +0 -43
  91. data/lib/remi/extractor/s3_file.rb +0 -57
  92. data/lib/remi/extractor/sftp_file.rb +0 -83
  93. data/spec/data_subject/csv_file_spec.rb +0 -79
  94. data/spec/data_subject/data_frame.rb +0 -27
@@ -0,0 +1,171 @@
1
+ module Remi
2
+
3
+ # @api private
4
+ #
5
+ # Contains methods shared between CsvFile Parser/Encoder
6
+ module DataSubject::CsvFile
7
+ def self.included(base)
8
+ base.extend(CsvFileClassMethods)
9
+ end
10
+
11
+ module CsvFileClassMethods
12
+ def default_csv_options
13
+ @default_csv_options ||= CSV::DEFAULT_OPTIONS.merge({
14
+ headers: true,
15
+ header_converters: Remi::FieldSymbolizers[:standard],
16
+ converters: [],
17
+ col_sep: ',',
18
+ encoding: 'UTF-8',
19
+ quote_char: '"'
20
+ })
21
+ end
22
+ end
23
+ end
24
+
25
+ # @api public
26
+ #
27
+ # CsvFile parser
28
+ #
29
+ # @example
30
+ #
31
+ # class MyJob < Remi::Job
32
+ # source :some_file do
33
+ # extractor Remi::Extractor::LocalFile.new(
34
+ # remote_path: 'some_file.csv'
35
+ # )
36
+ # parser Remi::Parser::CsvFile.new(
37
+ # csv_options: {
38
+ # headers: true,
39
+ # col_sep: '|'
40
+ # }
41
+ # )
42
+ # end
43
+ # end
44
+ #
45
+ # job = MyJob.new
46
+ # job.some_file.df
47
+ # # =>#<Daru::DataFrame:70153153438500 @name = 4c59cfdd-7de7-4264-8666-83153f46a9e4 @size = 3>
48
+ # # id name
49
+ # # 0 1 Albert
50
+ # # 1 2 Betsy
51
+ # # 2 3 Camu
52
+ class Parser::CsvFile < Parser
53
+ include Remi::DataSubject::CsvFile
54
+
55
+ # @param csv_options [Hash] Standard Ruby CSV parsing options.
56
+ # @param filename_field [Symbol] Name of the field to be used to write
57
+ # the filename of the CSV being parsed (default: nil, meaning no field will be used)
58
+ # @param preprocessor [Proc] A proc used to pre-process lines of the CSV file before being parsed
59
+ def initialize(*args, **kargs, &block)
60
+ super
61
+ init_csv_file(*args, **kargs, &block)
62
+ end
63
+
64
+ # @return [Hash] Csv options hash
65
+ attr_reader :csv_options
66
+
67
+ # Converts a list of filenames into a dataframe after parsing them
68
+ # according to the csv options that were set
69
+ # @param data [Object] Extracted data that needs to be parsed
70
+ # @return [Remi::DataFrame] The data converted into a dataframe
71
+ def parse(data)
72
+ # Assumes that each file has exactly the same structure
73
+ result_df = nil
74
+ Array(data).each_with_index do |filename, idx|
75
+ filename = filename.to_s
76
+
77
+ logger.info "Converting #{filename} to a dataframe"
78
+ processed_filename = preprocess(filename)
79
+ csv_df = Daru::DataFrame.from_csv processed_filename, @csv_options
80
+
81
+ csv_df[@filename_field] = Daru::Vector.new([filename] * csv_df.size, index: csv_df.index) if @filename_field
82
+ if idx == 0
83
+ result_df = csv_df
84
+ else
85
+ result_df = result_df.concat csv_df
86
+ end
87
+ end
88
+
89
+ Remi::DataFrame.create(:daru, result_df)
90
+ end
91
+
92
+
93
+ private
94
+
95
+ def preprocess(filename)
96
+ return filename unless @preprocessor
97
+ logger.info "Preprocessing #{filename}"
98
+ tmp_filename = File.join(Remi::Settings.work_dir, "#{Pathname.new(filename).basename}-#{SecureRandom.uuid}")
99
+
100
+ dirname = Pathname.new(tmp_filename).dirname
101
+ FileUtils.mkdir_p(dirname) unless File.directory? dirname
102
+
103
+ File.open(tmp_filename, 'w') do |outfile|
104
+ File.foreach(filename) do |in_line|
105
+ outfile.write @preprocessor.call(in_line)
106
+ end
107
+ end
108
+
109
+ tmp_filename
110
+ end
111
+
112
+ def init_csv_file(*args, csv_options: {}, filename_field: nil, preprocessor: nil, **kargs, &block)
113
+ @csv_options = self.class.default_csv_options.merge(csv_options)
114
+ @filename_field = filename_field
115
+ @preprocessor = preprocessor
116
+ end
117
+ end
118
+
119
+
120
+
121
+
122
+ # CsvFile Encoder
123
+ #
124
+ # @example
125
+ # class MyJob < Remi::Job
126
+ # target :my_target do
127
+ # encoder Remi::Encoder::CsvFile.new(
128
+ # csv_options: { col_sep: '|' }
129
+ # )
130
+ # loader Remi::Loader::LocalFile.new(
131
+ # path: 'test.csv'
132
+ # )
133
+ # end
134
+ # end
135
+ #
136
+ # my_df = Daru::DataFrame.new({ a: 1.upto(5).to_a, b: 6.upto(10) })
137
+ # job = MyJob.new
138
+ # job.my_target.df = my_df
139
+ # job.my_target.load
140
+ class Encoder::CsvFile < Encoder
141
+ include Remi::DataSubject::CsvFile
142
+
143
+ # @param work_path [String,Pathname] Path to a directory used to temporarily store CSV files (default: Settings.work_dir)
144
+ # @param csv_options [Hash] Standard Ruby CSV parser options.
145
+ def initialize(*args, **kargs, &block)
146
+ super
147
+ init_csv_file_encoder(*args, **kargs, &block)
148
+ end
149
+
150
+ default_csv_options[:row_sep] = "\n"
151
+
152
+ # @return [Hash] Csv options hash
153
+ attr_reader :csv_options
154
+
155
+ # Converts the dataframe to a CSV file stored in the local work directory.
156
+ #
157
+ # @param dataframe [Remi::DataFrame] The dataframe to be encoded
158
+ # @return [Object] The path to the file
159
+ def encode(dataframe)
160
+ logger.info "Writing CSV file to temporary location #{@working_file}"
161
+ dataframe.write_csv @working_file, @csv_options
162
+ @working_file
163
+ end
164
+
165
+ private
166
+ def init_csv_file_encoder(*args, work_path: Settings.work_dir, csv_options: {}, **kargs, &block)
167
+ @working_file = File.join(work_path, SecureRandom.uuid)
168
+ @csv_options = self.class.default_csv_options.merge(csv_options)
169
+ end
170
+ end
171
+ end
@@ -0,0 +1,106 @@
1
+ module Remi
2
+
3
+ # DataFrame extractor.
4
+ # This class is used to hard-code a dataframe as a simple array of rows.
5
+ #
6
+ # @example
7
+ #
8
+ # class MyJob < Remi::Job
9
+ # source :my_df do
10
+ # fields ({ id: {}, name: {}})
11
+ # extractor Remi::Extractor::DataFrame.new(
12
+ # data: [
13
+ # [1, 'Albert'],
14
+ # [2, 'Betsy'],
15
+ # [3, 'Camu']
16
+ # ]
17
+ # )
18
+ # parser Remi::Parser::DataFrame.new
19
+ # end
20
+ # end
21
+ #
22
+ # job = MyJob.new
23
+ # job.my_df.df.inspect
24
+ # # =>#<Daru::DataFrame:70153153438500 @name = 4c59cfdd-7de7-4264-8666-83153f46a9e4 @size = 3>
25
+ # # id name
26
+ # # 0 1 Albert
27
+ # # 1 2 Betsy
28
+ # # 2 3 Camu
29
+ class Extractor::DataFrame < Extractor
30
+
31
+ # @param data [Array<Array>] An array of arrays representing rows of a dataframe.
32
+ def initialize(*args, **kargs, &block)
33
+ super
34
+ init_data_frame_extractor(*args, **kargs, &block)
35
+ end
36
+
37
+ attr_accessor :data
38
+
39
+ # @return [Object] self
40
+ def extract
41
+ self
42
+ end
43
+
44
+ private
45
+
46
+ def init_data_frame_extractor(*args, data: [], **kargs, &block)
47
+ @data = data
48
+ end
49
+
50
+ end
51
+
52
+ # DataFrame parser.
53
+ # In order for the Extractor::DataFrame to be parsed correctly, fields must be defined
54
+ # on the data subject.
55
+ #
56
+ # @example
57
+ #
58
+ # class MyJob < Remi::Job
59
+ # source :my_df do
60
+ # fields ({ id: {}, name: {}})
61
+ # extractor Remi::Extractor::DataFrame.new(
62
+ # data: [
63
+ # [1, 'Albert'],
64
+ # [2, 'Betsy'],
65
+ # [3, 'Camu']
66
+ # ]
67
+ # )
68
+ # parser Remi::Parser::DataFrame.new
69
+ # end
70
+ # end
71
+ #
72
+ # job = MyJob.new
73
+ # job.my_df.df.inspect
74
+ # # =>#<Daru::DataFrame:70153153438500 @name = 4c59cfdd-7de7-4264-8666-83153f46a9e4 @size = 3>
75
+ # # id name
76
+ # # 0 1 Albert
77
+ # # 1 2 Betsy
78
+ # # 2 3 Camu
79
+ class Parser::DataFrame < Parser
80
+ # @param df_extract [Extractor::DataFrame] An object containing data extracted from memory
81
+ # @return [Remi::DataFrame] The data converted into a dataframe
82
+ def parse(df_extract)
83
+ Remi::DataFrame.create(:daru, df_extract.data.transpose, order: fields.keys)
84
+ end
85
+ end
86
+
87
+ # DataFrame encoder
88
+ class Encoder::DataFrame < Encoder
89
+ # @param data_frame [Remi::DataFrame] The data_frame to be encoded
90
+ # @return [Object] The data_frame
91
+ def encode(data_frame)
92
+ data_frame
93
+ end
94
+ end
95
+
96
+ # DataFrame loader
97
+ # Not sure this is needed, right?
98
+ # Maybe on SubJobs?
99
+ class Loader::DataFrame < Loader
100
+ # @param data [Object] Data that has been encoded appropriately to be loaded into the target
101
+ # @return [true] On success
102
+ def load(data)
103
+ true
104
+ end
105
+ end
106
+ end
@@ -0,0 +1,115 @@
1
+ module Remi
2
+
3
+ # Defines properties of an entry in a filesystem.
4
+ class Extractor::FileSystemEntry
5
+ # @param pathname [String] The path to the file system entry
6
+ # @param create_time [Time] The time the entry was created
7
+ # @param modified_time [Time] The time the entry was last modified
8
+ # @param raw [Object] An object that captures all other aspects of the entry, native to the system the entry lives on
9
+ def initialize(pathname:, create_time:, modified_time:, raw: nil)
10
+ @pathname = Pathname.new(pathname)
11
+ @create_time = create_time
12
+ @modified_time = modified_time
13
+ @raw = raw
14
+ end
15
+
16
+ attr_reader :pathname, :create_time, :modified_time, :raw
17
+
18
+ # @return [String] the base name of the entry
19
+ def name
20
+ @pathname.basename.to_s
21
+ end
22
+ end
23
+
24
+
25
+ # Parent class used to describe things that behave like file systems (e.g.,
26
+ # local file systems, ftp servers, S3 objects) to be used for extraction.
27
+ #
28
+ # @param remote_path [String] Path on the remote system that contains the files
29
+ # @param pattern [Regexp] Only files with a name that matches this regular
30
+ # expression are extracted
31
+ # @param local_path [String] Local path to put copies of extracted files
32
+ # @param most_recent_only [true,false] Only extract the most recent file
33
+ # that matches the given pattern
34
+ # @param group_by [Regexp] A regular expression used to group files together
35
+ # and only extract the most recent file in each group
36
+ # @param most_recent_by [Symbol] Indicates the FileSystemEntry property used to determine which
37
+ # file is the most recent (`:create_time` (default), `:modified_time`, `:name`)
38
+
39
+ class Extractor::FileSystem < Extractor
40
+ class FileNotFoundError < StandardError; end
41
+
42
+ def initialize(*args, **kargs, &block)
43
+ super
44
+ init_file_system(*args, **kargs)
45
+ end
46
+
47
+ attr_reader :remote_path
48
+ attr_reader :pattern
49
+ attr_reader :local_path
50
+ attr_reader :most_recent_only
51
+ attr_reader :group_by
52
+ attr_reader :most_recent_by
53
+
54
+ # Public: Called to extract files from the source filesystem.
55
+ #
56
+ # Returns an array containing the paths to all files extracted.
57
+ def extract
58
+ raise NoMethodError, "#{__method__} not defined for#{self.class.name}"
59
+ end
60
+
61
+ # Public: Returns an array of all FileSystemEntry instances that are in the remote_path.
62
+ # NOTE: all_entries is responsible for matching the path using @remote_path
63
+ def all_entries
64
+ raise NoMethodError, "#{__method__} not defined for#{self.class.name}"
65
+ end
66
+
67
+ # Public: Returns just the entries that are to be extracted.
68
+ def entries
69
+ if @group_by
70
+ most_recent_matching_entry_in_group
71
+ elsif @most_recent_only
72
+ Array(most_recent_matching_entry)
73
+ else
74
+ matching_entries
75
+ end
76
+ end
77
+
78
+ def matching_entries
79
+ all_entries.select { |e| @pattern.match e.name }
80
+ end
81
+
82
+ def most_recent_matching_entry
83
+ matching_entries.sort_by { |e| e.send(@most_recent_by) }.reverse.first
84
+ end
85
+
86
+ def most_recent_matching_entry_in_group
87
+ entries_with_group = matching_entries.map do |entry|
88
+ match = entry.name.match(@group_by)
89
+ next unless match
90
+
91
+ group = match.to_a[1..-1]
92
+ { group: group, entry: entry }
93
+ end.compact
94
+ sorted_entries_with_group = entries_with_group.sort_by { |e| [e[:group], e[:entry].send(@most_recent_by)] }.reverse
95
+
96
+ last_group = nil
97
+ sorted_entries_with_group.map do |entry|
98
+ next unless entry[:group] != last_group
99
+ last_group = entry[:group]
100
+ entry[:entry]
101
+ end.compact
102
+ end
103
+
104
+ private
105
+
106
+ def init_file_system(*args, remote_path:, pattern: /.*/, local_path: Settings.work_dir, most_recent_only: false, group_by: nil, most_recent_by: :create_time, **kargs, &block)
107
+ @remote_path = Pathname.new(remote_path)
108
+ @pattern = pattern
109
+ @local_path = Pathname.new(local_path)
110
+ @most_recent_only = most_recent_only
111
+ @group_by = group_by
112
+ @most_recent_by = most_recent_by
113
+ end
114
+ end
115
+ end
@@ -0,0 +1,109 @@
1
+ module Remi
2
+
3
+ # Local file extractor
4
+ # Used to "extract" a file from a local filesystem.
5
+ # Note that even though the file is local, we still use the parameter `remote_path`
6
+ # to indicate the path. This makes this class consistent with Remi::Extractor::FileSystem.
7
+ #
8
+ # @example
9
+ #
10
+ # class MyJob < Remi::Job
11
+ # source :some_file do
12
+ # extractor Remi::Extractor::LocalFile.new(
13
+ # remote_path: 'some_file.csv'
14
+ # )
15
+ # parser Remi::Parser::CsvFile.new(
16
+ # csv_options: {
17
+ # headers: true,
18
+ # col_sep: '|'
19
+ # }
20
+ # )
21
+ # end
22
+ # end
23
+ #
24
+ # job = MyJob.new
25
+ # job.some_file.df
26
+ # # =>#<Daru::DataFrame:70153153438500 @name = 4c59cfdd-7de7-4264-8666-83153f46a9e4 @size = 3>
27
+ # # id name
28
+ # # 0 1 Albert
29
+ # # 1 2 Betsy
30
+ # # 2 3 Camu
31
+ class Extractor::LocalFile < Extractor::FileSystem
32
+ def initialize(*args, **kargs)
33
+ super
34
+ init_local_file(*args, **kargs)
35
+ end
36
+
37
+ # Called to extract files from the source filesystem.
38
+ # @return [Array<String>] An array of paths to a local copy of the files extracted
39
+ def extract
40
+ entries.map(&:pathname)
41
+ end
42
+
43
+ # @return [Array<Extractor::FileSystemEntry>] List of objects in the remote path
44
+ def all_entries
45
+ @all_entries ||= all_entries!
46
+ end
47
+
48
+ # @return [Array<Extractor::FileSystemEntry>] List of objects in the remote path
49
+ def all_entries!
50
+ dir = @remote_path.directory? ? @remote_path + '*' : @remote_path
51
+ Dir[dir].map do |entry|
52
+ path = Pathname.new(entry)
53
+ if path.file?
54
+ Extractor::FileSystemEntry.new(
55
+ pathname: path.realpath.to_s,
56
+ create_time: path.ctime,
57
+ modified_time: path.mtime
58
+ )
59
+ end
60
+ end.compact
61
+ end
62
+
63
+ private
64
+
65
+ def init_local_file(*args, **kargs)
66
+ end
67
+ end
68
+
69
+
70
+ # Local file loader
71
+ # Used to output files to a local filesystem
72
+ # @example
73
+ # class MyJob < Remi::Job
74
+ # target :my_target do
75
+ # encoder Remi::Encoder::CsvFile.new(
76
+ # csv_options: { col_sep: '|' }
77
+ # )
78
+ # loader Remi::Loader::LocalFile.new(
79
+ # path: 'test.csv'
80
+ # )
81
+ # end
82
+ # end
83
+ #
84
+ # my_df = Daru::DataFrame.new({ a: 1.upto(5).to_a, b: 6.upto(10) })
85
+ # job = MyJob.new
86
+ # job.my_target.df = my_df
87
+ # job.my_target.load
88
+ class Loader::LocalFile < Loader
89
+ def initialize(*args, **kargs)
90
+ super
91
+ init_local_file_loader(*args, **kargs)
92
+ end
93
+
94
+ # Moves the file from the temporary workspace to another local path
95
+ # @param data [Object] The path to the file in the temporary work location
96
+ # @return [true] On success
97
+ def load(data)
98
+ logger.info "Writing file #{@local_path}"
99
+ FileUtils.mv(data, @local_path)
100
+ end
101
+
102
+
103
+ private
104
+
105
+ def init_local_file_loader(*args, path:, **kargs)
106
+ @local_path = path
107
+ end
108
+ end
109
+ end