remi 0.2.42 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94)
  1. checksums.yaml +4 -4
  2. data/.yardopts +7 -0
  3. data/Gemfile +1 -1
  4. data/Gemfile.lock +13 -26
  5. data/README.md +1 -1
  6. data/features/step_definitions/remi_step.rb +33 -13
  7. data/features/sub_job_example.feature +24 -0
  8. data/features/sub_transform_example.feature +35 -0
  9. data/features/sub_transform_many_to_many.feature +49 -0
  10. data/features/support/env_app.rb +1 -1
  11. data/jobs/all_jobs_shared.rb +19 -16
  12. data/jobs/copy_source_job.rb +11 -9
  13. data/jobs/csv_file_target_job.rb +10 -9
  14. data/jobs/json_job.rb +18 -14
  15. data/jobs/metadata_job.rb +33 -28
  16. data/jobs/parameters_job.rb +14 -11
  17. data/jobs/sample_job.rb +106 -77
  18. data/jobs/sftp_file_target_job.rb +14 -13
  19. data/jobs/sub_job_example_job.rb +86 -0
  20. data/jobs/sub_transform_example_job.rb +43 -0
  21. data/jobs/sub_transform_many_to_many_job.rb +46 -0
  22. data/jobs/transforms/concatenate_job.rb +16 -12
  23. data/jobs/transforms/data_frame_sieve_job.rb +24 -19
  24. data/jobs/transforms/date_diff_job.rb +15 -11
  25. data/jobs/transforms/nvl_job.rb +16 -12
  26. data/jobs/transforms/parse_date_job.rb +17 -14
  27. data/jobs/transforms/partitioner_job.rb +27 -19
  28. data/jobs/transforms/prefix_job.rb +13 -10
  29. data/jobs/transforms/truncate_job.rb +14 -10
  30. data/jobs/transforms/truthy_job.rb +11 -8
  31. data/lib/remi.rb +25 -11
  32. data/lib/remi/data_frame.rb +4 -4
  33. data/lib/remi/data_frame/daru.rb +1 -37
  34. data/lib/remi/data_subject.rb +234 -48
  35. data/lib/remi/data_subjects/csv_file.rb +171 -0
  36. data/lib/remi/data_subjects/data_frame.rb +106 -0
  37. data/lib/remi/data_subjects/file_system.rb +115 -0
  38. data/lib/remi/data_subjects/local_file.rb +109 -0
  39. data/lib/remi/data_subjects/none.rb +31 -0
  40. data/lib/remi/data_subjects/postgres.rb +186 -0
  41. data/lib/remi/data_subjects/s3_file.rb +84 -0
  42. data/lib/remi/data_subjects/salesforce.rb +211 -0
  43. data/lib/remi/data_subjects/sftp_file.rb +196 -0
  44. data/lib/remi/data_subjects/sub_job.rb +50 -0
  45. data/lib/remi/dsl.rb +74 -0
  46. data/lib/remi/encoder.rb +45 -0
  47. data/lib/remi/extractor.rb +21 -0
  48. data/lib/remi/field_symbolizers.rb +1 -0
  49. data/lib/remi/job.rb +279 -113
  50. data/lib/remi/job/parameters.rb +90 -0
  51. data/lib/remi/job/sub_job.rb +35 -0
  52. data/lib/remi/job/transform.rb +165 -0
  53. data/lib/remi/loader.rb +22 -0
  54. data/lib/remi/monkeys/daru.rb +4 -0
  55. data/lib/remi/parser.rb +44 -0
  56. data/lib/remi/testing/business_rules.rb +17 -23
  57. data/lib/remi/testing/data_stub.rb +2 -2
  58. data/lib/remi/version.rb +1 -1
  59. data/remi.gemspec +3 -0
  60. data/spec/data_subject_spec.rb +475 -11
  61. data/spec/data_subjects/csv_file_spec.rb +69 -0
  62. data/spec/data_subjects/data_frame_spec.rb +52 -0
  63. data/spec/{extractor → data_subjects}/file_system_spec.rb +0 -0
  64. data/spec/{extractor → data_subjects}/local_file_spec.rb +0 -0
  65. data/spec/data_subjects/none_spec.rb +41 -0
  66. data/spec/data_subjects/postgres_spec.rb +80 -0
  67. data/spec/{extractor → data_subjects}/s3_file_spec.rb +0 -0
  68. data/spec/data_subjects/salesforce_spec.rb +117 -0
  69. data/spec/{extractor → data_subjects}/sftp_file_spec.rb +16 -0
  70. data/spec/data_subjects/sub_job_spec.rb +33 -0
  71. data/spec/encoder_spec.rb +38 -0
  72. data/spec/extractor_spec.rb +11 -0
  73. data/spec/fixtures/sf_bulk_helper_stubs.rb +443 -0
  74. data/spec/job/transform_spec.rb +257 -0
  75. data/spec/job_spec.rb +507 -0
  76. data/spec/loader_spec.rb +11 -0
  77. data/spec/parser_spec.rb +38 -0
  78. data/spec/sf_bulk_helper_spec.rb +117 -0
  79. data/spec/testing/data_stub_spec.rb +5 -3
  80. metadata +109 -27
  81. data/features/aggregate.feature +0 -42
  82. data/jobs/aggregate_job.rb +0 -31
  83. data/jobs/transforms/transform_jobs.rb +0 -4
  84. data/lib/remi/data_subject/csv_file.rb +0 -162
  85. data/lib/remi/data_subject/data_frame.rb +0 -52
  86. data/lib/remi/data_subject/postgres.rb +0 -134
  87. data/lib/remi/data_subject/salesforce.rb +0 -136
  88. data/lib/remi/data_subject/sftp_file.rb +0 -65
  89. data/lib/remi/extractor/file_system.rb +0 -92
  90. data/lib/remi/extractor/local_file.rb +0 -43
  91. data/lib/remi/extractor/s3_file.rb +0 -57
  92. data/lib/remi/extractor/sftp_file.rb +0 -83
  93. data/spec/data_subject/csv_file_spec.rb +0 -79
  94. data/spec/data_subject/data_frame.rb +0 -27
@@ -36,7 +36,11 @@ require 'active_support/core_ext/time/calculations'
36
36
  require 'remi/version.rb'
37
37
 
38
38
  require 'remi/settings'
39
+ require 'remi/dsl'
39
40
  require 'remi/job'
41
+ require 'remi/job/parameters'
42
+ require 'remi/job/sub_job'
43
+ require 'remi/job/transform'
40
44
  require 'remi/source_to_target_map'
41
45
  require 'remi/source_to_target_map/map'
42
46
  require 'remi/source_to_target_map/row'
@@ -44,21 +48,31 @@ require 'remi/field_symbolizers'
44
48
 
45
49
  require 'remi/refinements/symbolizer'
46
50
 
47
- require 'remi/extractor/file_system'
48
- require 'remi/extractor/local_file'
49
- require 'remi/extractor/sftp_file'
50
- require 'remi/extractor/s3_file'
51
+ require 'remi/extractor'
52
+ require 'remi/parser'
53
+ require 'remi/encoder'
54
+ require 'remi/loader'
51
55
 
56
+ require 'remi/data_subject'
57
+ require 'remi/data_subjects/file_system'
58
+ require 'remi/data_subjects/local_file'
59
+ require 'remi/data_subjects/sftp_file'
60
+ require 'remi/data_subjects/s3_file'
61
+ require 'remi/data_subjects/csv_file'
62
+ #require 'remi/data_subjects/salesforce' # intentionally not included by default
63
+ require 'remi/data_subjects/postgres'
64
+ require 'remi/data_subjects/data_frame'
65
+ require 'remi/data_subjects/none'
66
+ require 'remi/data_subjects/sub_job'
52
67
 
53
68
  require 'remi/fields'
54
69
  require 'remi/data_frame'
55
70
  require 'remi/data_frame/daru'
56
71
 
57
- require 'remi/data_subject'
58
- require 'remi/data_subject/csv_file'
59
- #require 'remi/data_subject/salesforce' # intentionally not included by default
60
- require 'remi/data_subject/postgres'
61
- require 'remi/data_subject/sftp_file'
62
- require 'remi/data_subject/data_frame'
63
-
64
72
  require 'remi/transform'
73
+
74
+ require 'remi/monkeys/daru'
75
+
76
+ # Remi is Ruby Extract Modify and Integrate, a framework for writing ETL jobs in Ruby.
77
+ module Remi
78
+ end
@@ -1,12 +1,12 @@
1
1
  module Remi
2
2
  module DataFrame
3
3
  class << self
4
- def create(remi_df_type = :daru, *args, **kargs, &block)
5
- dataframe = case remi_df_type
4
+ def create(df_type = :daru, *args, **kargs, &block)
5
+ dataframe = case df_type
6
6
  when :daru
7
7
  Remi::DataFrame::Daru.new(*args, **kargs, &block)
8
8
  else
9
- raise TypeError, "Unknown frame type: #{remi_df_type}"
9
+ raise TypeError, "Unknown frame type: #{df_type}"
10
10
  end
11
11
  end
12
12
 
@@ -29,7 +29,7 @@ module Remi
29
29
  end
30
30
 
31
31
  # Public: Returns the type of DataFrame
32
- def remi_df_type
32
+ def df_type
33
33
  raise NoMethodError, "#{__method__} not defined for #{self.class.name}"
34
34
  end
35
35
  end
@@ -13,7 +13,7 @@ module Remi
13
13
 
14
14
 
15
15
  # Public: Returns the type of DataFrame
16
- def remi_df_type
16
+ def df_type
17
17
  :daru
18
18
  end
19
19
 
@@ -26,42 +26,6 @@ module Remi
26
26
  def self.from_hash_dump(filename)
27
27
  Marshal.load(File.binread(filename))
28
28
  end
29
-
30
- # Public: Allows the user to define an arbitrary aggregation function.
31
- #
32
- # by - The name of the DataFrame vector to use to group records.
33
- # func - A lambda function that accepts three arguments - the
34
- # first argument is the DataFrame, the second is the
35
- # key to the current group, and the third is the index
36
- # of the elements belonging to a group.
37
- #
38
- # Example:
39
- # df = Remi::DataFrame::Daru.new( { a: ['a','a','a','b','b'], year: ['2018','2015','2019', '2014', '2013'] })
40
- #
41
- # mymin = lambda do |vector, df, group_key, indices|
42
- # values = indices.map { |idx| df.row[idx][vector] }
43
- # "Group #{group_key} has a minimum value of #{values.min}"
44
- # end
45
- #
46
- # df.aggregate(by: :a, func: mymin.curry.(:year))
47
- #
48
- #
49
- # Returns a Daru::Vector.
50
- def aggregate(by:, func:)
51
- grouped = self.group_by(by)
52
- df_indices = self.index.to_a
53
- ::Daru::Vector.new(
54
- grouped.groups.reduce({}) do |h, (key, indices)|
55
- # Daru groups don't use the index of the dataframe when returning groups (WTF?).
56
- # Instead they return the position of the record in the dataframe. Here, we
57
- group_df_indices = indices.map { |v| df_indices[v] }
58
- group_key = key.size == 1 ? key.first : key
59
- h[group_key] = func.(self, group_key, group_df_indices)
60
- h
61
- end
62
- )
63
- end
64
-
65
29
  end
66
30
  end
67
31
  end
@@ -1,45 +1,92 @@
1
1
  module Remi
2
+
3
+ # The DataSubject is the parent class for DataSource and DataTarget. It is not intended
4
+ # to be used as a standalone class.
5
+ #
6
+ # A DataSubject is either a source or a target. It is largely used to associate
7
+ # a dataframe with a set of "fields" containing metadata describing how the vectors
8
+ # of the dataframe are meant to be interpreted. For example, one of the fields
9
+ # might represent a date with MM-DD-YYYY format.
10
+ #
11
+ # DataSubjects can be defined either using the standard `DataSubject.new(<args>)`
12
+ # convention, or through a DSL, which is convenient for data subjects defined
13
+ # as part of a job class definition.
2
14
  class DataSubject
3
- def initialize(*args, fields: Remi::Fields.new, remi_df_type: :daru, logger: Remi::Settings.logger, **kargs, &block)
4
- @fields = fields
5
- @remi_df_type = remi_df_type
6
- @logger = logger
15
+
16
+ # @param context [Object] the context in which the DSL is evaluated
17
+ # @param name [Symbol,String] the name of the data subject
18
+ # @param block [Proc] a block of code to be executed to define the data subject
19
+ def initialize(context=nil, name: 'NOT DEFINED', **kargs, &block)
20
+ @context = context
21
+ @name = name
22
+ @block = block
23
+ @df_type = :daru
24
+ @fields = Remi::Fields.new
25
+ @field_symbolizer = Remi::FieldSymbolizers[:standard]
26
+ end
27
+
28
+ attr_accessor :context, :name
29
+
30
+
31
+ # @param arg [Symbol] sets the type of dataframe to use for this subject
32
+ # @return [Symbol] the type of dataframe (defaults to `:daru` if not explicitly set)
33
+ def df_type(arg = nil)
34
+ return get_df_type unless arg
35
+ set_df_type arg
7
36
  end
8
37
 
9
- # Public: Fields defined for this data subject
10
- attr_accessor :fields
38
+ # @param arg [Hash, Remi::Fields] set the field metadata for this data subject
39
+ # @return [Remi::Fields] the field metadata for this data subject
40
+ def fields(arg = nil)
41
+ return get_fields unless arg
42
+ set_fields arg
43
+ end
11
44
 
12
- # Public: The default method for symbolizing field names
13
- def field_symbolizer
14
- Remi::FieldSymbolizers[:standard]
45
+ # @param arg [Hash, Remi::Fields] set the field metadata for this data subject
46
+ # @return [Remi::Fields] the field metadata for this data subject
47
+ def fields=(arg)
48
+ @fields = Remi::Fields.new(arg)
15
49
  end
16
50
 
17
- # Public: Access the dataframe from a DataSource
51
+ # Field symbolizer used to convert field names into symbols. This method sets
52
+ # the symbolizer for the data subject and also sets the symbolizers for
53
+ # any associated parser and encoders.
18
54
  #
19
- # Returns a Remi::DataFrame
55
+ # @return [Proc] the method for symbolizing field names
56
+ def field_symbolizer(arg = nil)
57
+ return @field_symbolizer unless arg
58
+ @field_symbolizer = if arg.is_a? Symbol
59
+ Remi::FieldSymbolizers[arg]
60
+ else
61
+ arg
62
+ end
63
+ end
64
+
65
+ # @return [Remi::DataFrame] the dataframe associated with this DataSubject
20
66
  def df
21
- @dataframe ||= Remi::DataFrame.create(@remi_df_type, [], order: @fields.keys)
67
+ @dataframe ||= Remi::DataFrame.create(df_type, [], order: fields.keys)
22
68
  end
23
69
 
24
- # Public: Reassigns the dataframe associated with this subject
25
- #
26
- # Returns the assigned dataframe
70
+ # Reassigns the dataframe associated with this DataSubject.
71
+ # @param new_dataframe [Object] The new dataframe object to be associated.
72
+ # @return [Remi::DataFrame] the associated dataframe
27
73
  def df=(new_dataframe)
28
- if new_dataframe.respond_to? :remi_df_type
74
+ if new_dataframe.respond_to? :df_type
29
75
  @dataframe = new_dataframe
30
76
  else
31
- @dataframe = Remi::DataFrame.create(@remi_df_type, new_dataframe)
77
+ @dataframe = Remi::DataFrame.create(df_type, new_dataframe)
32
78
  end
33
79
  end
34
80
 
35
- # Public: Enforces types defined in the field metadata.
36
- # For example, if a field has metadata with type: :date, then the
37
- # type enforcer will convert data in that field into a date, and will
81
+ # Enforces the types defined in the field metadata. Throws an
82
+ # error if a data element does not conform to the type. For
83
+ # example, if a field has metadata with type: :date, then the type
84
+ # enforcer will convert data in that field into a date, and will
38
85
  # throw an error if it is unable to parse any of the values.
39
86
  #
40
- # types - If set, restricts the data types that are enforced to just those listed.
41
- #
42
- # Returns nothing.
87
+ # @param types [Array<Symbol>] a list of metadata types to use to enforce. If none are given,
88
+ # all types are enforced.
89
+ # @return [self]
43
90
  def enforce_types(*types)
44
91
  sttm = SourceToTargetMap.new(df, source_metadata: fields)
45
92
  fields.keys.each do |field|
@@ -47,63 +94,202 @@ module Remi
47
94
  sttm.source(field).target(field).transform(Remi::Transform::EnforceType.new).execute
48
95
  end
49
96
 
50
- nil
97
+ self
98
+ end
99
+
100
+ # Defines the subject using the DSL in the block provided
101
+ #
102
+ # @return [self]
103
+ def dsl_eval
104
+ dsl_eval! unless @dsl_evaluated
105
+ @dsl_evaluated = true
106
+ self
107
+ end
108
+
109
+ def dsl_eval!
110
+ return self unless @block
111
+ Dsl.dsl_eval(self, @context, &@block)
112
+ end
113
+
114
+ private
115
+
116
+ def set_fields(arg)
117
+ self.fields = arg
118
+ end
119
+
120
+ def get_fields
121
+ dsl_eval
122
+ @fields
123
+ end
124
+
125
+ def set_df_type(arg)
126
+ @df_type = arg
127
+ end
128
+
129
+ def get_df_type
130
+ dsl_eval
131
+ @df_type
51
132
  end
52
133
  end
53
134
 
54
135
 
136
+
137
+ # The DataSource is a DataSubject meant to extract data from an external source
138
+ # and convert (parse) it into a dataframe.
139
+ #
140
+ # @example
141
+ #
142
+ # my_data_source = DataSource.new do
143
+ # extractor some_extractor
144
+ # parser some_parser
145
+ # end
146
+ #
147
+ # my_data_source.df #=> Returns a dataframe that is created by extracting data
148
+ # # from some_extractor and parsing it using some_parser.
55
149
  class DataSource < DataSubject
56
150
 
57
- # Public: Access the dataframe from a DataSource
151
+ def initialize(*args, **kargs, &block)
152
+ @parser = Parser::None.new
153
+ @parser.context = self
154
+ super
155
+ end
156
+
157
+ # @return [Array] the list of extractors that are defined for this data source
158
+ def extractors
159
+ @extractors ||= []
160
+ end
161
+
162
+ # @param obj [Object] adds an extractor object to the list of extractors
163
+ # @return [Array] the full list of extractors
164
+ def extractor(obj)
165
+ extractors << obj unless extractors.include? obj
166
+ end
167
+
168
+ # @param obj [Object] sets the parser for this data source
169
+ # @return [Object] the parser set for this data source
170
+ def parser(obj = nil)
171
+ return @parser unless obj
172
+ obj.context = self
173
+
174
+ @parser = obj
175
+ end
176
+
177
+ # Extracts data from all of the extractors.
178
+ # @return [Array] the result of each extractor
179
+ def extract!
180
+ extractors.map { |e| e.extract }
181
+ end
182
+
183
+ # Converts all of the extracted data to a dataframe
184
+ # @return [Remi::DataFrame]
185
+ def parse
186
+ parser.parse *extract
187
+ end
188
+
189
+ # The dataframe will only be extracted and parsed once, and only if it
190
+ # has not already been set (e.g., using #df=).
58
191
  #
59
- # Returns a Remi::DataFrame
192
+ # @return [Remi::DataFrame] the dataframe associated with this DataSubject
60
193
  def df
61
- @dataframe ||= to_dataframe
194
+ @dataframe ||= parsed_as_dataframe
62
195
  end
63
196
 
64
- # Public: Memoized version of extract!
197
+ # This clears any previously extracted and parsed results.
198
+ # A subsequent call to #df will redo the extract and parse.
199
+ #
200
+ # @return [nil]
201
+ def reset
202
+ @block = nil
203
+ @dataframe = nil
204
+ @extract = nil
205
+ end
206
+
207
+ # @return [Array<Object>] all of the data extracted from the extractors (memoized).
65
208
  def extract
66
209
  @extract ||= extract!
67
210
  end
68
211
 
69
- # Public: Called to extract data from the source.
70
- #
71
- # Returns data in a format that can be used to create a dataframe.
72
- def extract!
73
- raise NoMethodError, "#{__method__} not defined for #{self.class.name}"
74
- @extract
75
- end
76
212
 
77
- # Public: Converts extracted data to a dataframe
78
- #
79
- # Returns a Remi::DataFrame
80
- def to_dataframe
81
- raise NoMethodError, "#{__method__} not defined for #{self.class.name}"
213
+ private
214
+
215
+ # Runs the DSL definitions and all extracts, parses, and enforced types
216
+ # @return [Remi::DataFrame] the source extracted and parsed as a dataframe
217
+ def parsed_as_dataframe
218
+ dsl_eval if @block
219
+ dataframe = parse
220
+ dataframe
82
221
  end
83
222
  end
84
223
 
85
224
 
225
+ # The DataTarget is a DataSubject meant to load data from an associated dataframe
226
+ # into one or more target systems.
227
+ #
228
+ # @example
229
+ #
230
+ # my_data_target = DataTarget.new do
231
+ # encoder some_encoder
232
+ # loader some_loader
233
+ # end
234
+ #
235
+ # my_data_target.df = some_great_dataframe
236
+ # my_data_target.load #=> loads data from the dataframe into some target defined by some_loader
86
237
  class DataTarget < DataSubject
87
238
 
88
- # Public: Loads data to the target. This is automatically called
239
+ def initialize(*args, **kargs, &block)
240
+ @encoder = Encoder::None.new
241
+ @encoder.context = self
242
+ super
243
+ end
244
+
245
+ # @param obj [Object] sets the encoder for this data target
246
+ # @return [Object] the encoder set for this data target
247
+ def encoder(obj = nil)
248
+ return @encoder unless obj
249
+ obj.context = self
250
+
251
+ @encoder = obj
252
+ end
253
+
254
+ # @return [Array] the list of loaders associated with this data target
255
+ def loaders
256
+ @loaders ||= []
257
+ end
258
+
259
+ # @param obj [Object] adds a loader object to the list of loaders
260
+ # @return [Array] the full list of loaders
261
+ def loader(obj)
262
+ loaders << obj unless loaders.include? obj
263
+ end
264
+
265
+ # Loads data to all targets. This is automatically called
89
266
  # after all transforms have executed, but could also get called manually.
90
267
  # The actual load operation is only executed if it hasn't already.
91
268
  #
92
- # Returns true if the load operation was successful.
269
+ # @return [true, nil] true once the load has run; nil if it already ran or the dataframe is empty
93
270
  def load
94
- return true if @loaded || df.size == 0
271
+ return nil if @loaded || df.size == 0
272
+ dsl_eval if @block
95
273
 
96
- @loaded = load!
274
+ load!
275
+ @loaded = true
97
276
  end
98
277
 
99
- # Public: Performs the load operation, regardless of whether it has
278
+ # Performs the load operation, regardless of whether it has
100
279
  # already executed.
101
280
  #
102
- # Returns true if the load operation was successful
281
+ # @return [true] after every loader has been called
103
282
  def load!
104
- raise NoMethodError, "#{__method__} not defined for #{self.class.name}"
283
+ loaders.each { |l| l.load encoded_dataframe }
284
+ true
285
+ end
286
+
287
+ private
105
288
 
106
- false
289
+ # @return [Object] the encoded data suitable for the loaders
290
+ def encoded_dataframe
291
+ @encoded_dataframe ||= encoder.encode df
107
292
  end
293
+
108
294
  end
109
295
  end