remi 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/remi.rb CHANGED
@@ -4,6 +4,8 @@ File.expand_path(File.dirname(__FILE__)).tap {|pwd| $LOAD_PATH.unshift(pwd) unle
4
4
  require 'yaml'
5
5
  require 'json'
6
6
  require 'tmpdir'
7
+ require 'fileutils'
8
+
7
9
 
8
10
  # Gems
9
11
  require 'daru'
@@ -56,10 +58,11 @@ require 'remi/loader'
56
58
  require 'remi/data_subject'
57
59
  require 'remi/data_subjects/file_system'
58
60
  require 'remi/data_subjects/local_file'
61
+ #require 'remi/data_subjects/gsheet' # intentionally not included by default (must be optionally added)
59
62
  require 'remi/data_subjects/sftp_file'
60
63
  require 'remi/data_subjects/s3_file'
61
64
  require 'remi/data_subjects/csv_file'
62
- #require 'remi/data_subjects/salesforce' # intentionally not included by default
65
+ #require 'remi/data_subjects/salesforce' # intentionally not included by default (must be optionally added)
63
66
  require 'remi/data_subjects/postgres'
64
67
  require 'remi/data_subjects/data_frame'
65
68
  require 'remi/data_subjects/none'
@@ -64,6 +64,7 @@ module Remi
64
64
 
65
65
  # @return [Remi::DataFrame] the dataframe associated with this DataSubject
66
66
  def df
67
+ dsl_eval
67
68
  @dataframe ||= Remi::DataFrame.create(df_type, [], order: fields.keys)
68
69
  end
69
70
 
@@ -71,6 +72,7 @@ module Remi
71
72
  # @param new_dataframe [Object] The new dataframe object to be associated.
72
73
  # @return [Remi::DataFrame] the associated dataframe
73
74
  def df=(new_dataframe)
75
+ dsl_eval
74
76
  if new_dataframe.respond_to? :df_type
75
77
  @dataframe = new_dataframe
76
78
  else
@@ -259,6 +261,7 @@ module Remi
259
261
  # @param obj [Object] adds a loader object to the list of loaders
260
262
  # @return [Array] the full list of loaders
261
263
  def loader(obj)
264
+ obj.context = self
262
265
  loaders << obj unless loaders.include? obj
263
266
  end
264
267
 
@@ -269,7 +272,7 @@ module Remi
269
272
  # @return [true] if successful
270
273
  def load
271
274
  return nil if @loaded || df.size == 0
272
- dsl_eval if @block
275
+ dsl_eval
273
276
 
274
277
  load!
275
278
  @loaded = true
@@ -284,6 +287,12 @@ module Remi
284
287
  true
285
288
  end
286
289
 
290
# Associates a new dataframe with this data subject (delegating the
# assignment itself to the parent implementation) and then hands the
# encoded dataframe to every loader whose +autoload+ is truthy.
#
# @param new_dataframe [Object] The new dataframe object to be associated.
# @return [Remi::DataFrame] the associated dataframe
def df=(new_dataframe)
  super

  loaders.each do |ldr|
    next unless ldr.autoload

    ldr.load(encoded_dataframe)
  end

  df
end
295
+
287
296
  private
288
297
 
289
298
  # @return [Object] the encoded data suitable for the loaders
@@ -50,6 +50,8 @@ module Remi
50
50
  attr_reader :most_recent_only
51
51
  attr_reader :group_by
52
52
  attr_reader :most_recent_by
53
+ attr_reader :created_within
54
+
53
55
 
54
56
  # Public: Called to extract files from the source filesystem.
55
57
  #
@@ -70,6 +72,8 @@ module Remi
70
72
  most_recent_matching_entry_in_group
71
73
  elsif @most_recent_only
72
74
  Array(most_recent_matching_entry)
75
+ elsif @created_within
76
+ get_created_within
73
77
  else
74
78
  matching_entries
75
79
  end
@@ -83,6 +87,29 @@ module Remi
83
87
  matching_entries.sort_by { |e| e.send(@most_recent_by) }.reverse.first
84
88
  end
85
89
 
90
# Selects the matching entries created within the last +@created_within+ hours.
#
# When +@most_recent_only+ is set, only the newest entry (ordered by
# +@most_recent_by+) is considered; otherwise every matching entry is checked.
# Age is measured from the current time and expressed in hours in both cases,
# which is the unit the error messages report.
#
# @return [Array] the entries that are recent enough
# @raise [RuntimeError] if no matching entry is recent enough (including when
#   there are no matching entries at all)
def get_created_within
  now = Time.now
  # 3600.0 converts the Time difference (seconds) to hours without needing ActiveSupport.
  recent = ->(entry) { (now - Time.at(entry.create_time)) / 3600.0 < @created_within }

  if @most_recent_only
    newest = matching_entries.max_by { |e| e.send(@most_recent_by) }
    unless newest && recent.call(newest)
      raise "No file Found. All files are older than #{@created_within} hrs"
    end

    # Wrapped literally (rather than with Array()) so that an entry responding
    # to #to_a is not exploded into its members.
    [newest]
  else
    recent_entries = matching_entries.select(&recent)
    if recent_entries.empty?
      raise "No files Found. All files are older than #{@created_within} hrs"
    end

    recent_entries
  end
end
112
+
86
113
  def most_recent_matching_entry_in_group
87
114
  entries_with_group = matching_entries.map do |entry|
88
115
  match = entry.name.match(@group_by)
@@ -103,13 +130,16 @@ module Remi
103
130
 
104
131
  private
105
132
 
106
- def init_file_system(*args, remote_path:, pattern: /.*/, local_path: Settings.work_dir, most_recent_only: false, group_by: nil, most_recent_by: :create_time, **kargs, &block)
133
+ def init_file_system(*args, remote_path:, pattern: /.*/, local_path: Settings.work_dir, most_recent_only: false, group_by: nil, most_recent_by: :create_time, created_within: nil, **kargs, &block)
134
+
107
135
  @remote_path = Pathname.new(remote_path)
108
136
  @pattern = pattern
109
137
  @local_path = Pathname.new(local_path)
110
138
  @most_recent_only = most_recent_only
111
139
  @group_by = group_by
112
140
  @most_recent_by = most_recent_by
141
+ @created_within = created_within
142
+
113
143
  end
114
144
  end
115
145
  end
@@ -0,0 +1,140 @@
1
+ require 'google/apis/sheets_v4'
2
+ require 'google/apis/drive_v3'
3
+ require 'googleauth'
4
+ require 'googleauth/stores/file_token_store'
5
+ require 'googleauth/user_refresh'
6
+
7
+ module Remi
8
+
9
# Extracts Google Sheets that live in a Google Drive folder: the Drive API is
# used to list the folder's files and the Sheets API to read the values of
# each selected file.
class Extractor::Gsheet < Extractor::FileSystem

  # Accepts the Extractor::FileSystem arguments plus the +credentials:+ and
  # +folder_id:+ keywords consumed by #init_gsheet_extractor.
  def initialize(*args, **kargs, &block)
    super
    init_gsheet_extractor(*args, **kargs)
  end

  # @return [Array] one Sheets API response per extracted entry (set by #extract)
  attr_reader :data
  attr_reader :client_id
  attr_reader :client_secret
  attr_reader :access_token
  attr_reader :refresh_token
  attr_reader :scope
  attr_reader :expiration_time

  # These readers were first published as +ref_token+ and +expire_time+, names
  # that matched no instance variable and therefore always returned nil. They
  # are kept as aliases of the correctly named readers for existing callers.
  alias_method :ref_token, :refresh_token
  alias_method :expire_time, :expiration_time

  # Builds the OAuth2 user credentials from the values supplied at construction.
  #
  # @return [Google::Auth::UserRefreshCredentials]
  def authorize
    Google::Auth::UserRefreshCredentials.new(
      client_id: @client_id,
      client_secret: @client_secret,
      scope: @scope,
      access_token: @access_token,
      refresh_token: @refresh_token,
      # NOTE(review): the division suggests expiration_time is supplied in
      # milliseconds and converted to seconds here - confirm against callers.
      expires_at: @expiration_time / 1000
    )
  end

  # Lists the files contained in a Drive folder.
  #
  # @param folder_id [String] id of the Drive folder to list
  # @return [Array] the +files+ of the Drive list response
  def get_file_list(folder_id)
    service = Google::Apis::DriveV3::DriveService.new
    service.client_options.application_name = @application_name
    service.authorization = authorize
    service_list_files(service, folder_id).files
  end

  # Issues the Drive list request. Only a single page of at most 10 files
  # (newest first) is requested; the next-page token is not followed.
  #
  # @param service [Google::Apis::DriveV3::DriveService] an authorized service
  # @param folder_id [String] id of the Drive folder to list
  # @return [Object] the Drive list response
  def service_list_files(service, folder_id)
    service.list_files(
      q: "'#{folder_id}' in parents",
      page_size: 10,
      order_by: 'createdTime desc',
      fields: 'nextPageToken, files(id, name, createdTime, mimeType)'
    )
  end

  # Reads the values of one range of a spreadsheet.
  #
  # @param service [Google::Apis::SheetsV4::SheetsService] an authorized service
  # @param spreadsheet_id [String] id of the spreadsheet to read
  # @param range [String] range to read (defaults to the sheet named 'Sheet1')
  # @return [Object] the Sheets values response
  def get_spreadsheet_vals(service, spreadsheet_id, range = 'Sheet1')
    service.get_spreadsheet_values(spreadsheet_id, range)
  end

  # Reads the values of every selected entry and stores the responses in #data.
  #
  # @return [self]
  def extract
    service = Google::Apis::SheetsV4::SheetsService.new
    service.client_options.application_name = @application_name
    service.authorization = authorize
    @data = []

    # Each entry's +raw+ holds the Drive file id (see #all_entries!).
    entries.each do |file|
      data.push(get_spreadsheet_vals(service, file.raw))
    end

    self
  end

  # @return [Array<Extractor::FileSystemEntry>] (Memoized) list of files in the Drive folder
  def all_entries
    @all_entries ||= all_entries!
  end

  # @return [Array<Extractor::FileSystemEntry>] freshly fetched (not memoized) list of
  #   files in the Drive folder
  def all_entries!
    get_file_list(@default_folder_id).map do |drive_file|
      attrs = drive_file.to_h
      FileSystemEntry.new(
        pathname: File.join(@default_folder_id, attrs[:name]),
        create_time: attrs[:created_time],
        # Only the creation time is requested from Drive, so it doubles as the modified time.
        modified_time: attrs[:created_time],
        raw: attrs[:id]
      )
    end
  end

  private

  # @param credentials [Hash] must contain :application_name, :client_id,
  #   :access_token, :refresh_token, :client_secret, :project_id and
  #   :expiration_time (a KeyError is raised for any missing key)
  # @param folder_id [String] id of the Drive folder to extract from
  def init_gsheet_extractor(*args, credentials:, folder_id:, **kargs)
    @default_folder_id = folder_id
    @oob_uri = 'urn:ietf:wg:oauth:2.0:oob'
    @application_name = credentials.fetch(:application_name)

    @client_secrets_path = File.join(
      Dir.home,
      '.credentials/client_secret.json'
    )
    @credentials_path = File.join(
      Dir.home,
      '.credentials/sheets.googleapis.com-ruby-remi.yaml'
    )
    @client_id = credentials.fetch(:client_id)
    @access_token = credentials.fetch(:access_token)
    @refresh_token = credentials.fetch(:refresh_token)
    @client_secret = credentials.fetch(:client_secret)
    @project_id = credentials.fetch(:project_id)
    @scope = ["https://www.googleapis.com/auth/drive","https://www.googleapis.com/auth/spreadsheets"]
    @expiration_time = Integer(credentials.fetch(:expiration_time))
  end
end
110
+
111
# Parses the Sheets responses collected by Extractor::Gsheet into a dataframe.
class Parser::Gsheet < Parser

  # Builds one column per header cell of the first non-empty response and
  # appends the data rows of every response to those columns. The first row of
  # each response is treated as its header row and is never added as data.
  #
  # Rows shorter than the header are padded with nil so that all columns stay
  # the same length; cells beyond the last header column are ignored because
  # there is no field to hold them. Responses without values are skipped.
  #
  # @param gs_extract [Object] extractor result whose +data+ is a list of
  #   responses that respond to +values+ (an array of rows)
  # @return [Remi::DataFrame] a :daru dataframe ordered by the header fields
  #   (with no fields when there was nothing to parse)
  def parse(gs_extract)
    columns = nil

    gs_extract.data.each do |google_val|
      # +values+ may be nil for a response that carries no cells.
      header, *rows = Array(google_val.values)
      next if header.nil?

      columns ||= header.each_with_object({}) do |name, cols|
        cols[field_symbolizer.call(name)] = []
      end

      rows.each do |row|
        columns.each_key.with_index { |key, idx| columns[key] << row[idx] }
      end
    end

    columns ||= {}
    Remi::DataFrame.create(:daru, columns, order: columns.keys)
  end
end
139
+
140
+ end
@@ -61,6 +61,7 @@ module Remi
61
61
  logger.info "Downloading #{entry.name} to #{local_file}"
62
62
  retry_download { sftp.download!(File.join(@remote_path, entry.name), local_file) }
63
63
  local_file
64
+
64
65
  end
65
66
  end
66
67
  end
@@ -11,7 +11,8 @@ module Remi
11
11
  attr_accessor :sub_job, :data_subject
12
12
 
13
13
  def extract
14
- sub_job.job.send(data_subject).df
14
+ sub_job.execute unless sub_job.sub_job.send(data_subject).is_a? Remi::DataSource
15
+ sub_job.sub_job.send(data_subject).df
15
16
  end
16
17
 
17
18
  private
@@ -26,25 +27,33 @@ module Remi
26
27
  class Loader::SubJob < Loader
27
28
  # @param sub_job [Object] The name (relative to parent job) of the subjob to use
28
29
  # @param data_subject [Symbol] The name (relative to the sub job) of the sub job's data frame
30
+ # @param merge_fields [True,False] Indicates whether fields from the calling data subject
31
+ # should be merged with those defined in the sub job.
29
32
  def initialize(*args, **kargs, &block)
30
33
  super
31
34
  init_sub_job_loader(*args, **kargs, &block)
32
35
  end
33
36
 
34
- attr_accessor :sub_job, :data_subject
37
+ attr_accessor :sub_job, :data_subject, :merge_fields
35
38
 
36
39
  # @param data_frame [Object] Data frame to load to target sub job data subject
37
40
  # @return [true] On success
38
41
  def load(data_frame)
39
- sub_job.job.send(data_subject).df = data_frame
42
+ sub_job.sub_job.send(data_subject).df = data_frame
43
+ sub_job.sub_job.send(data_subject).fields.merge! fields if merge_fields
44
+ true
45
+ end
46
+
47
+ def autoload
40
48
  true
41
49
  end
42
50
 
43
51
  private
44
52
 
45
- def init_sub_job_loader(*args, sub_job:, data_subject:, **kargs, &block)
53
+ def init_sub_job_loader(*args, sub_job:, data_subject:, merge_fields: true, **kargs, &block)
46
54
  @sub_job = sub_job
47
55
  @data_subject = data_subject
56
+ @merge_fields = merge_fields
48
57
  end
49
58
  end
50
59
  end
data/lib/remi/encoder.rb CHANGED
@@ -38,7 +38,7 @@ module Remi
38
38
 
39
39
  # @return [Remi::Fields] The fields (uses the context fields if defined)
40
40
  def fields
41
- return context.fields if context if context.respond_to? :fields
41
+ return context.fields if context && context.respond_to?(:fields)
42
42
  @fields
43
43
  end
44
44
  end
data/lib/remi/job.rb CHANGED
@@ -271,7 +271,8 @@ module Remi
271
271
  " parameters: #{params.to_h.keys}\n" +
272
272
  " sources: #{sources}\n" +
273
273
  " targets: #{targets}\n" +
274
- " transforms: #{transforms}"
274
+ " transforms: #{transforms}\n" +
275
+ " sub_jobs: #{sub_jobs}"
275
276
  end
276
277
 
277
278
 
@@ -282,6 +283,7 @@ module Remi
282
283
  # @return [self]
283
284
  def execute(*components)
284
285
  execute_transforms if components.empty? || components.include?(:transforms)
286
+ execute_sub_jobs if components.empty? || components.include?(:sub_jobs)
285
287
  execute_load_targets if components.empty? || components.include?(:load_targets)
286
288
  self
287
289
  end
@@ -334,6 +336,12 @@ module Remi
334
336
  self
335
337
  end
336
338
 
339
# Calls +execute+ on each of this job's sub jobs, looking each one up by
# sending its name to the job.
#
# @return [self]
def execute_sub_jobs
  sub_jobs.each do |sub_job_name|
    send(sub_job_name).execute
  end

  self
end
344
+
337
345
  # Adds all parameters listed to the job parameters
338
346
  def add_params(**kargs)
339
347
  kargs.each { |k,v| params[k] = v }
@@ -39,6 +39,7 @@ module Remi
39
39
  class Parameters
40
40
  def initialize(context=nil)
41
41
  @context = context
42
+ @params_methods = []
42
43
  @params = {}
43
44
  end
44
45
 
@@ -65,10 +66,13 @@ module Remi
65
66
  def []=(name, value)
66
67
  __define__(name) { value } unless respond_to? name
67
68
  @params[name] = value
69
+
70
+ value
68
71
  end
69
72
 
70
- # @return [Hash] The parameters as a hash
73
+ # @return [Hash] The evaluated parameters as a hash
71
74
  def to_h
75
+ @params_methods.each { |p| self.send(p) }
72
76
  @params
73
77
  end
74
78
 
@@ -76,13 +80,14 @@ module Remi
76
80
  def clone
77
81
  the_clone = super
78
82
  the_clone.instance_variable_set(:@params, @params.dup)
83
+ the_clone.instance_variable_set(:@params_methods, @params_methods.dup)
79
84
  the_clone
80
85
  end
81
86
 
82
87
  def __define__(name, &block)
83
- @params[name] = nil
88
+ @params_methods << name unless @params_methods.include? name
84
89
  define_singleton_method name do
85
- @params[name] ||= Remi::Dsl.dsl_return(self, @context, &block)
90
+ @params.fetch(name) { |name| @params[name] = Remi::Dsl.dsl_return(self, @context, &block) }
86
91
  end
87
92
  end
88
93
  end
@@ -10,25 +10,31 @@ module Remi
10
10
  attr_accessor :context, :name
11
11
 
12
12
  def dsl_return
13
- sub_job = Dsl.dsl_return(self, @context, &@block)
14
- raise ArgumentError, "SubJob DSL must return a Remi::Job" unless sub_job.is_a? Job
15
- sub_job
13
+ result = Dsl.dsl_return(self, @context, &@block)
14
+ raise ArgumentError, "SubJob DSL must return a Remi::Job" unless result.is_a? Job
15
+ result
16
16
  end
17
17
 
18
- def job
19
- @job ||= dsl_return
18
+ def sub_job
19
+ @sub_job ||= dsl_return
20
20
  end
21
21
 
22
22
  def fields(data_subject)
23
- job.send(data_subject).dsl_eval.fields
23
+ sub_job.send(data_subject).dsl_eval.fields
24
24
  end
25
25
 
26
26
  def execute
27
- job.execute
27
+ execute! unless @executed
28
+ end
29
+
30
+ def execute!
31
+ result = sub_job.execute
32
+ @executed = true
33
+ result
28
34
  end
29
35
 
30
36
  def execute_transforms
31
- job.execute(:transforms)
37
+ sub_job.execute(:transforms)
32
38
  end
33
39
  end
34
40
  end