remi 0.2.42 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (94) hide show
  1. checksums.yaml +4 -4
  2. data/.yardopts +7 -0
  3. data/Gemfile +1 -1
  4. data/Gemfile.lock +13 -26
  5. data/README.md +1 -1
  6. data/features/step_definitions/remi_step.rb +33 -13
  7. data/features/sub_job_example.feature +24 -0
  8. data/features/sub_transform_example.feature +35 -0
  9. data/features/sub_transform_many_to_many.feature +49 -0
  10. data/features/support/env_app.rb +1 -1
  11. data/jobs/all_jobs_shared.rb +19 -16
  12. data/jobs/copy_source_job.rb +11 -9
  13. data/jobs/csv_file_target_job.rb +10 -9
  14. data/jobs/json_job.rb +18 -14
  15. data/jobs/metadata_job.rb +33 -28
  16. data/jobs/parameters_job.rb +14 -11
  17. data/jobs/sample_job.rb +106 -77
  18. data/jobs/sftp_file_target_job.rb +14 -13
  19. data/jobs/sub_job_example_job.rb +86 -0
  20. data/jobs/sub_transform_example_job.rb +43 -0
  21. data/jobs/sub_transform_many_to_many_job.rb +46 -0
  22. data/jobs/transforms/concatenate_job.rb +16 -12
  23. data/jobs/transforms/data_frame_sieve_job.rb +24 -19
  24. data/jobs/transforms/date_diff_job.rb +15 -11
  25. data/jobs/transforms/nvl_job.rb +16 -12
  26. data/jobs/transforms/parse_date_job.rb +17 -14
  27. data/jobs/transforms/partitioner_job.rb +27 -19
  28. data/jobs/transforms/prefix_job.rb +13 -10
  29. data/jobs/transforms/truncate_job.rb +14 -10
  30. data/jobs/transforms/truthy_job.rb +11 -8
  31. data/lib/remi.rb +25 -11
  32. data/lib/remi/data_frame.rb +4 -4
  33. data/lib/remi/data_frame/daru.rb +1 -37
  34. data/lib/remi/data_subject.rb +234 -48
  35. data/lib/remi/data_subjects/csv_file.rb +171 -0
  36. data/lib/remi/data_subjects/data_frame.rb +106 -0
  37. data/lib/remi/data_subjects/file_system.rb +115 -0
  38. data/lib/remi/data_subjects/local_file.rb +109 -0
  39. data/lib/remi/data_subjects/none.rb +31 -0
  40. data/lib/remi/data_subjects/postgres.rb +186 -0
  41. data/lib/remi/data_subjects/s3_file.rb +84 -0
  42. data/lib/remi/data_subjects/salesforce.rb +211 -0
  43. data/lib/remi/data_subjects/sftp_file.rb +196 -0
  44. data/lib/remi/data_subjects/sub_job.rb +50 -0
  45. data/lib/remi/dsl.rb +74 -0
  46. data/lib/remi/encoder.rb +45 -0
  47. data/lib/remi/extractor.rb +21 -0
  48. data/lib/remi/field_symbolizers.rb +1 -0
  49. data/lib/remi/job.rb +279 -113
  50. data/lib/remi/job/parameters.rb +90 -0
  51. data/lib/remi/job/sub_job.rb +35 -0
  52. data/lib/remi/job/transform.rb +165 -0
  53. data/lib/remi/loader.rb +22 -0
  54. data/lib/remi/monkeys/daru.rb +4 -0
  55. data/lib/remi/parser.rb +44 -0
  56. data/lib/remi/testing/business_rules.rb +17 -23
  57. data/lib/remi/testing/data_stub.rb +2 -2
  58. data/lib/remi/version.rb +1 -1
  59. data/remi.gemspec +3 -0
  60. data/spec/data_subject_spec.rb +475 -11
  61. data/spec/data_subjects/csv_file_spec.rb +69 -0
  62. data/spec/data_subjects/data_frame_spec.rb +52 -0
  63. data/spec/{extractor → data_subjects}/file_system_spec.rb +0 -0
  64. data/spec/{extractor → data_subjects}/local_file_spec.rb +0 -0
  65. data/spec/data_subjects/none_spec.rb +41 -0
  66. data/spec/data_subjects/postgres_spec.rb +80 -0
  67. data/spec/{extractor → data_subjects}/s3_file_spec.rb +0 -0
  68. data/spec/data_subjects/salesforce_spec.rb +117 -0
  69. data/spec/{extractor → data_subjects}/sftp_file_spec.rb +16 -0
  70. data/spec/data_subjects/sub_job_spec.rb +33 -0
  71. data/spec/encoder_spec.rb +38 -0
  72. data/spec/extractor_spec.rb +11 -0
  73. data/spec/fixtures/sf_bulk_helper_stubs.rb +443 -0
  74. data/spec/job/transform_spec.rb +257 -0
  75. data/spec/job_spec.rb +507 -0
  76. data/spec/loader_spec.rb +11 -0
  77. data/spec/parser_spec.rb +38 -0
  78. data/spec/sf_bulk_helper_spec.rb +117 -0
  79. data/spec/testing/data_stub_spec.rb +5 -3
  80. metadata +109 -27
  81. data/features/aggregate.feature +0 -42
  82. data/jobs/aggregate_job.rb +0 -31
  83. data/jobs/transforms/transform_jobs.rb +0 -4
  84. data/lib/remi/data_subject/csv_file.rb +0 -162
  85. data/lib/remi/data_subject/data_frame.rb +0 -52
  86. data/lib/remi/data_subject/postgres.rb +0 -134
  87. data/lib/remi/data_subject/salesforce.rb +0 -136
  88. data/lib/remi/data_subject/sftp_file.rb +0 -65
  89. data/lib/remi/extractor/file_system.rb +0 -92
  90. data/lib/remi/extractor/local_file.rb +0 -43
  91. data/lib/remi/extractor/s3_file.rb +0 -57
  92. data/lib/remi/extractor/sftp_file.rb +0 -83
  93. data/spec/data_subject/csv_file_spec.rb +0 -79
  94. data/spec/data_subject/data_frame.rb +0 -27
@@ -36,7 +36,11 @@ require 'active_support/core_ext/time/calculations'
36
36
  require 'remi/version.rb'
37
37
 
38
38
  require 'remi/settings'
39
+ require 'remi/dsl'
39
40
  require 'remi/job'
41
+ require 'remi/job/parameters'
42
+ require 'remi/job/sub_job'
43
+ require 'remi/job/transform'
40
44
  require 'remi/source_to_target_map'
41
45
  require 'remi/source_to_target_map/map'
42
46
  require 'remi/source_to_target_map/row'
@@ -44,21 +48,31 @@ require 'remi/field_symbolizers'
44
48
 
45
49
  require 'remi/refinements/symbolizer'
46
50
 
47
- require 'remi/extractor/file_system'
48
- require 'remi/extractor/local_file'
49
- require 'remi/extractor/sftp_file'
50
- require 'remi/extractor/s3_file'
51
+ require 'remi/extractor'
52
+ require 'remi/parser'
53
+ require 'remi/encoder'
54
+ require 'remi/loader'
51
55
 
56
+ require 'remi/data_subject'
57
+ require 'remi/data_subjects/file_system'
58
+ require 'remi/data_subjects/local_file'
59
+ require 'remi/data_subjects/sftp_file'
60
+ require 'remi/data_subjects/s3_file'
61
+ require 'remi/data_subjects/csv_file'
62
+ #require 'remi/data_subjects/salesforce' # intentionally not included by default
63
+ require 'remi/data_subjects/postgres'
64
+ require 'remi/data_subjects/data_frame'
65
+ require 'remi/data_subjects/none'
66
+ require 'remi/data_subjects/sub_job'
52
67
 
53
68
  require 'remi/fields'
54
69
  require 'remi/data_frame'
55
70
  require 'remi/data_frame/daru'
56
71
 
57
- require 'remi/data_subject'
58
- require 'remi/data_subject/csv_file'
59
- #require 'remi/data_subject/salesforce' # intentionally not included by default
60
- require 'remi/data_subject/postgres'
61
- require 'remi/data_subject/sftp_file'
62
- require 'remi/data_subject/data_frame'
63
-
64
72
  require 'remi/transform'
73
+
74
+ require 'remi/monkeys/daru'
75
+
76
+ # Remi is Ruby Extract Modify and Integrate, a framework for writing ETL jobs in Ruby.
77
+ module Remi
78
+ end
@@ -1,12 +1,12 @@
1
1
  module Remi
2
2
  module DataFrame
3
3
  class << self
4
- def create(remi_df_type = :daru, *args, **kargs, &block)
5
- dataframe = case remi_df_type
4
+ def create(df_type = :daru, *args, **kargs, &block)
5
+ dataframe = case df_type
6
6
  when :daru
7
7
  Remi::DataFrame::Daru.new(*args, **kargs, &block)
8
8
  else
9
- raise TypeError, "Unknown frame type: #{remi_df_type}"
9
+ raise TypeError, "Unknown frame type: #{df_type}"
10
10
  end
11
11
  end
12
12
 
@@ -29,7 +29,7 @@ module Remi
29
29
  end
30
30
 
31
31
  # Public: Returns the type of DataFrame
32
- def remi_df_type
32
+ def df_type
33
33
  raise NoMethodError, "#{__method__} not defined for #{self.class.name}"
34
34
  end
35
35
  end
@@ -13,7 +13,7 @@ module Remi
13
13
 
14
14
 
15
15
  # Public: Returns the type of DataFrame
16
- def remi_df_type
16
+ def df_type
17
17
  :daru
18
18
  end
19
19
 
@@ -26,42 +26,6 @@ module Remi
26
26
  def self.from_hash_dump(filename)
27
27
  Marshal.load(File.binread(filename))
28
28
  end
29
-
30
- # Public: Allows the user to define an arbitrary aggregation function.
31
- #
32
- # by - The name of the DataFrame vector to use to group records.
33
- # func - A lambda function that accepts three arguments - the
34
- # first argument is the DataFrame, the second is the
35
- # key to the current group, and the third is the index
36
- # of the elements belonging to a group.
37
- #
38
- # Example:
39
- # df = Remi::DataFrame::Daru.new( { a: ['a','a','a','b','b'], year: ['2018','2015','2019', '2014', '2013'] })
40
- #
41
- # mymin = lambda do |vector, df, group_key, indices|
42
- # values = indices.map { |idx| df.row[idx][vector] }
43
- # "Group #{group_key} has a minimum value of #{values.min}"
44
- # end
45
- #
46
- # df.aggregate(by: :a, func: mymin.curry.(:year))
47
- #
48
- #
49
- # Returns a Daru::Vector.
50
- def aggregate(by:, func:)
51
- grouped = self.group_by(by)
52
- df_indices = self.index.to_a
53
- ::Daru::Vector.new(
54
- grouped.groups.reduce({}) do |h, (key, indices)|
55
- # Daru groups don't use the index of the dataframe when returning groups (WTF?).
56
- # Instead they return the position of the record in the dataframe. Here, we
57
- group_df_indices = indices.map { |v| df_indices[v] }
58
- group_key = key.size == 1 ? key.first : key
59
- h[group_key] = func.(self, group_key, group_df_indices)
60
- h
61
- end
62
- )
63
- end
64
-
65
29
  end
66
30
  end
67
31
  end
@@ -1,45 +1,92 @@
1
1
  module Remi
2
+
3
+ # The DataSubject is the parent class for DataSource and DataTarget. It is not intended
4
+ # to be used as a standalone class.
5
+ #
6
+ # A DataSubject is either a source or a target. It is largely used to associate
7
+ # a dataframe with a set of "fields" containing metadata describing how the vectors
8
+ # of the dataframe are meant to be interpreted. For example, one of the fields
9
+ # might represent a date with MM-DD-YYYY format.
10
+ #
11
+ # DataSubjects can be defined either using the standard `DataSubject.new(<args>)`
12
+ # convention, or through a DSL, which is convenient for data subjects defined
13
+ # as part of a job class definition.
2
14
  class DataSubject
3
- def initialize(*args, fields: Remi::Fields.new, remi_df_type: :daru, logger: Remi::Settings.logger, **kargs, &block)
4
- @fields = fields
5
- @remi_df_type = remi_df_type
6
- @logger = logger
15
+
16
+ # @param context [Object] the context in which the DSL is evaluated
17
+ # @param name [Symbol,String] the name of the data subject
18
+ # @param block [Proc] a block of code to be executed to define the data subject
19
+ def initialize(context=nil, name: 'NOT DEFINED', **kargs, &block)
20
+ @context = context
21
+ @name = name
22
+ @block = block
23
+ @df_type = :daru
24
+ @fields = Remi::Fields.new
25
+ @field_symbolizer = Remi::FieldSymbolizers[:standard]
26
+ end
27
+
28
+ attr_accessor :context, :name
29
+
30
+
31
+ # @param arg [Symbol] sets the type of dataframe to use for this subject
32
+ # @return [Symbol] the type of dataframe (defaults to `:daru` if not explicitly set)
33
+ def df_type(arg = nil)
34
+ return get_df_type unless arg
35
+ set_df_type arg
7
36
  end
8
37
 
9
- # Public: Fields defined for this data subject
10
- attr_accessor :fields
38
+ # @param arg [Hash, Remi::Fields] set the field metadata for this data subject
39
+ # @return [Remi::Fields] the field metadata for this data subject
40
+ def fields(arg = nil)
41
+ return get_fields unless arg
42
+ set_fields arg
43
+ end
11
44
 
12
- # Public: The default method for symbolizing field names
13
- def field_symbolizer
14
- Remi::FieldSymbolizers[:standard]
45
+ # @param arg [Hash, Remi::Fields] set the field metadata for this data subject
46
+ # @return [Remi::Fields] the field metadata for this data subject
47
+ def fields=(arg)
48
+ @fields = Remi::Fields.new(arg)
15
49
  end
16
50
 
17
- # Public: Access the dataframe from a DataSource
51
+ # Field symbolizer used to convert field names into symbols. This method sets
52
+ # the symbolizer for the data subject and also sets the symbolizers for
53
+ # any associated parser and encoders.
18
54
  #
19
- # Returns a Remi::DataFrame
55
+ # @return [Proc] the method for symbolizing field names
56
+ def field_symbolizer(arg = nil)
57
+ return @field_symbolizer unless arg
58
+ @field_symbolizer = if arg.is_a? Symbol
59
+ Remi::FieldSymbolizers[arg]
60
+ else
61
+ arg
62
+ end
63
+ end
64
+
65
+ # @return [Remi::DataFrame] the dataframe associated with this DataSubject
20
66
  def df
21
- @dataframe ||= Remi::DataFrame.create(@remi_df_type, [], order: @fields.keys)
67
+ @dataframe ||= Remi::DataFrame.create(df_type, [], order: fields.keys)
22
68
  end
23
69
 
24
- # Public: Reassigns the dataframe associated with this subject
25
- #
26
- # Returns the assigned dataframe
70
+ # Reassigns the dataframe associated with this DataSubject.
71
+ # @param new_dataframe [Object] The new dataframe object to be associated.
72
+ # @return [Remi::DataFrame] the associated dataframe
27
73
  def df=(new_dataframe)
28
- if new_dataframe.respond_to? :remi_df_type
74
+ if new_dataframe.respond_to? :df_type
29
75
  @dataframe = new_dataframe
30
76
  else
31
- @dataframe = Remi::DataFrame.create(@remi_df_type, new_dataframe)
77
+ @dataframe = Remi::DataFrame.create(df_type, new_dataframe)
32
78
  end
33
79
  end
34
80
 
35
- # Public: Enforces types defined in the field metadata.
36
- # For example, if a field has metadata with type: :date, then the
37
- # type enforcer will convert data in that field into a date, and will
81
+ # Enforces the types defined in the field metadata. Throws an
82
+ # error if a data element does not conform to the type. For
83
+ # example, if a field has metadata with type: :date, then the type
84
+ # enforcer will convert data in that field into a date, and will
38
85
  # throw an error if it is unable to parse any of the values.
39
86
  #
40
- # types - If set, restricts the data types that are enforced to just those listed.
41
- #
42
- # Returns nothing.
87
+ # @param types [Array<Symbol>] a list of metadata types to use to enforce. If none are given,
88
+ # all types are enforced.
89
+ # @return [self]
43
90
  def enforce_types(*types)
44
91
  sttm = SourceToTargetMap.new(df, source_metadata: fields)
45
92
  fields.keys.each do |field|
@@ -47,63 +94,202 @@ module Remi
47
94
  sttm.source(field).target(field).transform(Remi::Transform::EnforceType.new).execute
48
95
  end
49
96
 
50
- nil
97
+ self
98
+ end
99
+
100
+ # Defines the subject using the DSL in the block provided
101
+ #
102
+ # @return [self]
103
+ def dsl_eval
104
+ dsl_eval! unless @dsl_evaluated
105
+ @dsl_evaluated = true
106
+ self
107
+ end
108
+
109
+ def dsl_eval!
110
+ return self unless @block
111
+ Dsl.dsl_eval(self, @context, &@block)
112
+ end
113
+
114
+ private
115
+
116
+ def set_fields(arg)
117
+ self.fields = arg
118
+ end
119
+
120
+ def get_fields
121
+ dsl_eval
122
+ @fields
123
+ end
124
+
125
+ def set_df_type(arg)
126
+ @df_type = arg
127
+ end
128
+
129
+ def get_df_type
130
+ dsl_eval
131
+ @df_type
51
132
  end
52
133
  end
53
134
 
54
135
 
136
+
137
+ # The DataSource is a DataSubject meant to extract data from an external source
138
+ # and convert (parse) it into a dataframe.
139
+ #
140
+ # @example
141
+ #
142
+ # my_data_source = DataSource.new do
143
+ # extractor some_extractor
144
+ # parser some_parser
145
+ # end
146
+ #
147
+ # my_data_source.df #=> Returns a dataframe that is created by extracting data
148
+ # # from some_extractor and parsing it using some_parser.
55
149
  class DataSource < DataSubject
56
150
 
57
- # Public: Access the dataframe from a DataSource
151
+ def initialize(*args, **kargs, &block)
152
+ @parser = Parser::None.new
153
+ @parser.context = self
154
+ super
155
+ end
156
+
157
+ # @return [Array] the list of extractors that are defined for this data source
158
+ def extractors
159
+ @extractors ||= []
160
+ end
161
+
162
+ # @param obj [Object] adds an extractor object to the list of extractors
163
+ # @return [Array] the full list of extractors
164
+ def extractor(obj)
165
+ extractors << obj unless extractors.include? obj
166
+ end
167
+
168
+ # @param obj [Object] sets the parser for this data source
169
+ # @return [Object] the parser set for this data source
170
+ def parser(obj = nil)
171
+ return @parser unless obj
172
+ obj.context = self
173
+
174
+ @parser = obj
175
+ end
176
+
177
+ # Extracts data from all of the extractors.
178
+ # @return [Array] the result of each extractor
179
+ def extract!
180
+ extractors.map { |e| e.extract }
181
+ end
182
+
183
+ # Converts all of the extracted data to a dataframe
184
+ # @return [Remi::DataFrame]
185
+ def parse
186
+ parser.parse *extract
187
+ end
188
+
189
+ # The dataframe will only be extracted and parsed once, and only if it
190
+ # has not already been set (e.g., using #df=).
58
191
  #
59
- # Returns a Remi::DataFrame
192
+ # @return [Remi::DataFrame] the dataframe associated with this DataSubject
60
193
  def df
61
- @dataframe ||= to_dataframe
194
+ @dataframe ||= parsed_as_dataframe
62
195
  end
63
196
 
64
- # Public: Memoized version of extract!
197
+ # This clears any previously extracted and parsed results.
198
+ # A subsequent call to #df will redo the extract and parse.
199
+ #
200
+ # @return [nil] nothing; the cached block, dataframe, and extract are cleared
201
+ def reset
202
+ @block = nil
203
+ @dataframe = nil
204
+ @extract = nil
205
+ end
206
+
207
+ # @return [Array<Object>] all of the data extracted from the extractors (memoized).
65
208
  def extract
66
209
  @extract ||= extract!
67
210
  end
68
211
 
69
- # Public: Called to extract data from the source.
70
- #
71
- # Returns data in a format that can be used to create a dataframe.
72
- def extract!
73
- raise NoMethodError, "#{__method__} not defined for #{self.class.name}"
74
- @extract
75
- end
76
212
 
77
- # Public: Converts extracted data to a dataframe
78
- #
79
- # Returns a Remi::DataFrame
80
- def to_dataframe
81
- raise NoMethodError, "#{__method__} not defined for #{self.class.name}"
213
+ private
214
+
215
+ # Runs the DSL definitions and all extracts, parses, and enforced types
216
+ # @return [Remi::DataFrame] the source extracted and parsed as a dataframe
217
+ def parsed_as_dataframe
218
+ dsl_eval if @block
219
+ dataframe = parse
220
+ dataframe
82
221
  end
83
222
  end
84
223
 
85
224
 
225
+ # The DataTarget is a DataSubject meant to load data from an associated dataframe
226
+ # into one or more target systems.
227
+ #
228
+ # @example
229
+ #
230
+ # my_data_target = DataTarget.new do
231
+ # encoder some_encoder
232
+ # loader some_loader
233
+ # end
234
+ #
235
+ # my_data_target.df = some_great_dataframe
236
+ # my_data_target.load #=> loads data from the dataframe into some target defined by some_loader
86
237
  class DataTarget < DataSubject
87
238
 
88
- # Public: Loads data to the target. This is automatically called
239
+ def initialize(*args, **kargs, &block)
240
+ @encoder = Encoder::None.new
241
+ @encoder.context = self
242
+ super
243
+ end
244
+
245
+ # @param obj [Object] sets the encoder for this data target
246
+ # @return [Object] the encoder set for this data target
247
+ def encoder(obj = nil)
248
+ return @encoder unless obj
249
+ obj.context = self
250
+
251
+ @encoder = obj
252
+ end
253
+
254
+ # @return [Array] the list of loaders associated with this data target
255
+ def loaders
256
+ @loaders ||= []
257
+ end
258
+
259
+ # @param obj [Object] adds a loader object to the list of loaders
260
+ # @return [Array] the full list of loaders
261
+ def loader(obj)
262
+ loaders << obj unless loaders.include? obj
263
+ end
264
+
265
+ # Loads data to all targets. This is automatically called
89
266
  # after all transforms have executed, but could also get called manually.
90
267
  # The actual load operation is only executed if it hasn't already.
91
268
  #
92
- # Returns true if the load operation was successful.
269
+ # @return [true] if successful
93
270
  def load
94
- return true if @loaded || df.size == 0
271
+ return nil if @loaded || df.size == 0
272
+ dsl_eval if @block
95
273
 
96
- @loaded = load!
274
+ load!
275
+ @loaded = true
97
276
  end
98
277
 
99
- # Public: Performs the load operation, regardless of whether it has
278
+ # Performs the load operation, regardless of whether it has
100
279
  # already executed.
101
280
  #
102
- # Returns true if the load operation was successful
281
+ # @return [nil] nothing
103
282
  def load!
104
- raise NoMethodError, "#{__method__} not defined for #{self.class.name}"
283
+ loaders.each { |l| l.load encoded_dataframe }
284
+ true
285
+ end
286
+
287
+ private
105
288
 
106
- false
289
+ # @return [Object] the encoded data suitable for the loaders
290
+ def encoded_dataframe
291
+ @encoded_dataframe ||= encoder.encode df
107
292
  end
293
+
108
294
  end
109
295
  end