remi 0.3.0 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
data/lib/remi.rb CHANGED
@@ -4,6 +4,8 @@ File.expand_path(File.dirname(__FILE__)).tap {|pwd| $LOAD_PATH.unshift(pwd) unle
4
4
  require 'yaml'
5
5
  require 'json'
6
6
  require 'tmpdir'
7
+ require 'fileutils'
8
+
7
9
 
8
10
  # Gems
9
11
  require 'daru'
@@ -56,10 +58,11 @@ require 'remi/loader'
56
58
  require 'remi/data_subject'
57
59
  require 'remi/data_subjects/file_system'
58
60
  require 'remi/data_subjects/local_file'
61
+ #require 'remi/data_subjects/gsheet' # intentionally not included by default (must be optionally added)
59
62
  require 'remi/data_subjects/sftp_file'
60
63
  require 'remi/data_subjects/s3_file'
61
64
  require 'remi/data_subjects/csv_file'
62
- #require 'remi/data_subjects/salesforce' # intentionally not included by default
65
+ #require 'remi/data_subjects/salesforce' # intentionally not included by default (must be optionally added)
63
66
  require 'remi/data_subjects/postgres'
64
67
  require 'remi/data_subjects/data_frame'
65
68
  require 'remi/data_subjects/none'
@@ -64,6 +64,7 @@ module Remi
64
64
 
65
65
  # @return [Remi::DataFrame] the dataframe associated with this DataSubject
66
66
  def df
67
+ dsl_eval
67
68
  @dataframe ||= Remi::DataFrame.create(df_type, [], order: fields.keys)
68
69
  end
69
70
 
@@ -71,6 +72,7 @@ module Remi
71
72
  # @param new_dataframe [Object] The new dataframe object to be associated.
72
73
  # @return [Remi::DataFrame] the associated dataframe
73
74
  def df=(new_dataframe)
75
+ dsl_eval
74
76
  if new_dataframe.respond_to? :df_type
75
77
  @dataframe = new_dataframe
76
78
  else
@@ -259,6 +261,7 @@ module Remi
259
261
  # @param obj [Object] adds a loader object to the list of loaders
260
262
  # @return [Array] the full list of loaders
261
263
  def loader(obj)
264
+ obj.context = self
262
265
  loaders << obj unless loaders.include? obj
263
266
  end
264
267
 
@@ -269,7 +272,7 @@ module Remi
269
272
  # @return [true] if successful
270
273
  def load
271
274
  return nil if @loaded || df.size == 0
272
- dsl_eval if @block
275
+ dsl_eval
273
276
 
274
277
  load!
275
278
  @loaded = true
@@ -284,6 +287,12 @@ module Remi
284
287
  true
285
288
  end
286
289
 
290
# Assigns the dataframe through the parent setter, then hands the encoded
# dataframe to every loader whose +autoload+ is truthy.
#
# @param new_dataframe [Object] the dataframe passed on to the parent setter
# @return [Object] the value of +df+ (only observable when the setter is
#   invoked via +send+; a plain assignment expression evaluates to its argument)
def df=(new_dataframe)
  super
  loaders.each do |a_loader|
    next unless a_loader.autoload

    a_loader.load(encoded_dataframe)
  end
  df
end
295
+
287
296
  private
288
297
 
289
298
  # @return [Object] the encoded data suitable for the loaders
@@ -50,6 +50,8 @@ module Remi
50
50
  attr_reader :most_recent_only
51
51
  attr_reader :group_by
52
52
  attr_reader :most_recent_by
53
+ attr_reader :created_within
54
+
53
55
 
54
56
  # Public: Called to extract files from the source filesystem.
55
57
  #
@@ -70,6 +72,8 @@ module Remi
70
72
  most_recent_matching_entry_in_group
71
73
  elsif @most_recent_only
72
74
  Array(most_recent_matching_entry)
75
+ elsif @created_within
76
+ get_created_within
73
77
  else
74
78
  matching_entries
75
79
  end
@@ -83,6 +87,29 @@ module Remi
83
87
  matching_entries.sort_by { |e| e.send(@most_recent_by) }.reverse.first
84
88
  end
85
89
 
90
# Selects the matching entries created within the last +@created_within+
# hours, measured from the current time.
#
# When +@most_recent_only+ is set, only the newest matching entry (ordered by
# +@most_recent_by+) is considered; otherwise every matching entry that is
# recent enough is returned.
#
# NOTE(review): the visible caller tests +@most_recent_only+ before
# +@created_within+, so the first branch looks reachable only when this
# method is invoked directly -- confirm.
#
# @return [Array] the entries that are recent enough (never empty)
# @raise [RuntimeError] if there is no matching entry that is recent enough
def get_created_within
  now = Time.now
  # Age is expressed in hours in both branches so that it agrees with the
  # unit named in the error messages.
  recent = ->(entry) { (now - Time.at(entry.create_time)) / 3600.0 < @created_within }

  if @most_recent_only
    newest = matching_entries.max_by { |entry| entry.send(@most_recent_by) }
    # +newest+ is nil when nothing matched at all; report that the same way
    # as a stale file instead of failing on nil.
    raise "No file Found. All files are older than #{@created_within} hrs" unless newest && recent.call(newest)

    Array(newest)
  else
    fresh = matching_entries.select(&recent)
    raise "No files Found. All files are older than #{@created_within} hrs" if fresh.empty?

    fresh
  end
end
112
+
86
113
  def most_recent_matching_entry_in_group
87
114
  entries_with_group = matching_entries.map do |entry|
88
115
  match = entry.name.match(@group_by)
@@ -103,13 +130,16 @@ module Remi
103
130
 
104
131
  private
105
132
 
106
- def init_file_system(*args, remote_path:, pattern: /.*/, local_path: Settings.work_dir, most_recent_only: false, group_by: nil, most_recent_by: :create_time, **kargs, &block)
133
+ def init_file_system(*args, remote_path:, pattern: /.*/, local_path: Settings.work_dir, most_recent_only: false, group_by: nil, most_recent_by: :create_time, created_within: nil, **kargs, &block)
134
+
107
135
  @remote_path = Pathname.new(remote_path)
108
136
  @pattern = pattern
109
137
  @local_path = Pathname.new(local_path)
110
138
  @most_recent_only = most_recent_only
111
139
  @group_by = group_by
112
140
  @most_recent_by = most_recent_by
141
+ @created_within = created_within
142
+
113
143
  end
114
144
  end
115
145
  end
@@ -0,0 +1,140 @@
1
+ require 'google/apis/sheets_v4'
2
+ require 'google/apis/drive_v3'
3
+ require 'googleauth'
4
+ require 'googleauth/stores/file_token_store'
5
+ require 'googleauth/user_refresh'
6
+
7
+ module Remi
8
+
9
# Extractor that lists the files of a Google Drive folder and reads the
# values of each selected spreadsheet through the Google Sheets API.
class Extractor::Gsheet < Extractor::FileSystem

  # Accepts the arguments of the parent extractor plus the +credentials:+
  # and +folder_id:+ keywords consumed by #init_gsheet_extractor.
  def initialize(*args, **kargs, &block)
    super
    init_gsheet_extractor(*args, **kargs)
  end

  attr_reader :data
  attr_reader :client_id
  attr_reader :client_secret
  attr_reader :access_token
  attr_reader :refresh_token
  attr_reader :scope
  attr_reader :expiration_time

  # Older reader names, kept for backward compatibility. They used to read
  # instance variables that were never assigned and therefore returned nil.
  alias_method :ref_token, :refresh_token
  alias_method :expire_time, :expiration_time

  # Builds user credentials from the values supplied at construction time.
  #
  # @return [Google::Auth::UserRefreshCredentials] credentials for the API services
  def authorize
    Google::Auth::UserRefreshCredentials.new(
      client_id:     @client_id,
      client_secret: @client_secret,
      scope:         @scope,
      access_token:  @access_token,
      refresh_token: @refresh_token,
      # presumably expiration_time is given in milliseconds -- TODO confirm
      expires_at:    @expiration_time / 1000
    )
  end

  # @param folder_id [String] id of the Drive folder to list
  # @return [Object] the +files+ of the Drive list response
  def get_file_list(folder_id)
    drive = authorized_service(Google::Apis::DriveV3::DriveService)
    service_list_files(drive, folder_id).files
  end

  # Lists the files whose parent is +folder_id+, newest first.
  #
  # NOTE(review): only a single page of at most 10 files is requested and
  # nextPageToken is not followed, so larger folders are truncated.
  #
  # @param service [Object] Drive service used to issue the request
  # @param folder_id [String] id of the Drive folder to list
  # @return [Object] the Drive list response
  def service_list_files(service, folder_id)
    service.list_files(q: "'#{folder_id}' in parents", page_size: 10, order_by: 'createdTime desc', fields: 'nextPageToken, files(id, name, createdTime, mimeType)')
  end

  # Reads the values of the range named 'Sheet1' of a spreadsheet.
  #
  # @param service [Object] Sheets service used to issue the request
  # @param spreadsheet_id [String] id of the spreadsheet to read
  # @return [Object] the value range response
  def get_spreadsheet_vals(service, spreadsheet_id)
    service.get_spreadsheet_values(spreadsheet_id, 'Sheet1')
  end

  # Reads every selected entry and stores the responses in +data+.
  #
  # @return [self]
  def extract
    sheets = authorized_service(Google::Apis::SheetsV4::SheetsService)
    @data = entries.map { |file| get_spreadsheet_vals(sheets, file.raw) }

    self
  end

  # @return [Array<Extractor::FileSystemEntry>] (Memoized) list of files in the Drive folder
  def all_entries
    @all_entries ||= all_entries!
  end

  # @return [Array<Extractor::FileSystemEntry>] freshly fetched list of files in the Drive folder
  def all_entries!
    get_file_list(@default_folder_id).map do |drive_file|
      attributes = drive_file.to_h
      FileSystemEntry.new(
        pathname: File.join(@default_folder_id, attributes[:name]),
        create_time: attributes[:created_time],
        # only createdTime is requested from Drive, so it doubles as the modified time
        modified_time: attributes[:created_time],
        raw: attributes[:id]
      )
    end
  end

  private

  # Creates an API service of the given class carrying the application name
  # and the credentials returned by #authorize.
  def authorized_service(service_class)
    service = service_class.new
    service.client_options.application_name = @application_name
    service.authorization = authorize
    service
  end

  # Stores the folder id and the OAuth settings read from +credentials+.
  #
  # @param credentials [Hash] must contain :application_name, :client_id,
  #   :access_token, :refresh_token, :client_secret, :project_id and
  #   :expiration_time
  # @param folder_id [String] id of the Drive folder to extract from
  # @raise [KeyError] if a required credential key is missing
  def init_gsheet_extractor(*args, credentials:, folder_id:, **kargs)
    @default_folder_id = folder_id
    @oob_uri = 'urn:ietf:wg:oauth:2.0:oob'
    @application_name = credentials.fetch(:application_name)

    @client_secrets_path = File.join(
      Dir.home,
      '.credentials/client_secret.json'
    )
    @credentials_path = File.join(
      Dir.home,
      '.credentials/sheets.googleapis.com-ruby-remi.yaml'
    )
    @client_id = credentials.fetch(:client_id)
    @access_token = credentials.fetch(:access_token)
    @refresh_token = credentials.fetch(:refresh_token)
    @client_secret = credentials.fetch(:client_secret)
    @project_id = credentials.fetch(:project_id)
    @scope = ["https://www.googleapis.com/auth/drive","https://www.googleapis.com/auth/spreadsheets"]
    @expiration_time = Integer(credentials.fetch(:expiration_time))
  end
end
110
+
111
# Parser that turns the value ranges collected by a Gsheet extractor into a
# single dataframe.
class Parser::Gsheet < Parser

  # Builds one column per header cell of the first non-empty value range and
  # appends the data rows of every value range to those columns.
  #
  # The first row of each value range is treated as its header and is not
  # added as data. Rows shorter than the header are padded with nil so all
  # columns keep the same length (the Sheets API omits trailing empty
  # cells). Cells beyond the last header column have no column to go to and
  # are ignored.
  #
  # @param gs_extract [Object] extractor result responding to +data+
  # @return [Remi::DataFrame] dataframe with one vector per header field;
  #   built from an empty hash when nothing was extracted
  def parse(gs_extract)
    columns = nil

    gs_extract.data.each do |value_range|
      # +values+ is nil for an empty sheet; Array() turns that into no rows.
      header, *rows = Array(value_range.values)
      next if header.nil?

      columns ||= header.each_with_object({}) do |title, cols|
        cols[field_symbolizer.call(title)] = []
      end
      names = columns.keys

      rows.each do |row|
        names.each_with_index { |name, idx| columns[name] << row[idx] }
      end
    end

    columns ||= {}
    Remi::DataFrame.create(:daru, columns, order: columns.keys)
  end
end
139
+
140
+ end
@@ -61,6 +61,7 @@ module Remi
61
61
  logger.info "Downloading #{entry.name} to #{local_file}"
62
62
  retry_download { sftp.download!(File.join(@remote_path, entry.name), local_file) }
63
63
  local_file
64
+
64
65
  end
65
66
  end
66
67
  end
@@ -11,7 +11,8 @@ module Remi
11
11
  attr_accessor :sub_job, :data_subject
12
12
 
13
13
  def extract
14
- sub_job.job.send(data_subject).df
14
+ sub_job.execute unless sub_job.sub_job.send(data_subject).is_a? Remi::DataSource
15
+ sub_job.sub_job.send(data_subject).df
15
16
  end
16
17
 
17
18
  private
@@ -26,25 +27,33 @@ module Remi
26
27
  class Loader::SubJob < Loader
27
28
  # @param sub_job [Object] The name (relative to parent job) of the subjob to use
28
29
  # @param data_subject [Symbol] The name (relative to the sub job) of the sub job's data frame
30
+ # @param merge_fields [True,False] Indicates whether fields from the calling data subject
31
+ # should be merged with those defined in the sub job.
29
32
  def initialize(*args, **kargs, &block)
30
33
  super
31
34
  init_sub_job_loader(*args, **kargs, &block)
32
35
  end
33
36
 
34
- attr_accessor :sub_job, :data_subject
37
+ attr_accessor :sub_job, :data_subject, :merge_fields
35
38
 
36
39
  # @param data_frame [Object] Data frame to load to target sub job data subject
37
40
  # @return [true] On success
38
41
  def load(data_frame)
39
- sub_job.job.send(data_subject).df = data_frame
42
+ sub_job.sub_job.send(data_subject).df = data_frame
43
+ sub_job.sub_job.send(data_subject).fields.merge! fields if merge_fields
44
+ true
45
+ end
46
+
47
+ def autoload
40
48
  true
41
49
  end
42
50
 
43
51
  private
44
52
 
45
- def init_sub_job_loader(*args, sub_job:, data_subject:, **kargs, &block)
53
+ def init_sub_job_loader(*args, sub_job:, data_subject:, merge_fields: true, **kargs, &block)
46
54
  @sub_job = sub_job
47
55
  @data_subject = data_subject
56
+ @merge_fields = merge_fields
48
57
  end
49
58
  end
50
59
  end
data/lib/remi/encoder.rb CHANGED
@@ -38,7 +38,7 @@ module Remi
38
38
 
39
39
  # @return [Remi::Fields] The fields (uses the context fields if defined)
40
40
  def fields
41
- return context.fields if context if context.respond_to? :fields
41
+ return context.fields if context && context.respond_to?(:fields)
42
42
  @fields
43
43
  end
44
44
  end
data/lib/remi/job.rb CHANGED
@@ -271,7 +271,8 @@ module Remi
271
271
  " parameters: #{params.to_h.keys}\n" +
272
272
  " sources: #{sources}\n" +
273
273
  " targets: #{targets}\n" +
274
- " transforms: #{transforms}"
274
+ " transforms: #{transforms}\n" +
275
+ " sub_jobs: #{sub_jobs}"
275
276
  end
276
277
 
277
278
 
@@ -282,6 +283,7 @@ module Remi
282
283
  # @return [self]
283
284
  def execute(*components)
284
285
  execute_transforms if components.empty? || components.include?(:transforms)
286
+ execute_sub_jobs if components.empty? || components.include?(:sub_jobs)
285
287
  execute_load_targets if components.empty? || components.include?(:load_targets)
286
288
  self
287
289
  end
@@ -334,6 +336,12 @@ module Remi
334
336
  self
335
337
  end
336
338
 
339
# Calls +execute+ on the object returned by each name listed in +sub_jobs+.
# Skipping work that was already done is left to that object's +execute+.
#
# @return [self]
def execute_sub_jobs
  sub_jobs.each do |sub_job_name|
    send(sub_job_name).execute
  end
  self
end
344
+
337
345
  # Adds all parameters listed to the job parameters
338
346
  def add_params(**kargs)
339
347
  kargs.each { |k,v| params[k] = v }
@@ -39,6 +39,7 @@ module Remi
39
39
  class Parameters
40
40
  def initialize(context=nil)
41
41
  @context = context
42
+ @params_methods = []
42
43
  @params = {}
43
44
  end
44
45
 
@@ -65,10 +66,13 @@ module Remi
65
66
  def []=(name, value)
66
67
  __define__(name) { value } unless respond_to? name
67
68
  @params[name] = value
69
+
70
+ value
68
71
  end
69
72
 
70
- # @return [Hash] The parameters as a hash
73
+ # @return [Hash] The evaluated parameters as a hash
71
74
  def to_h
75
+ @params_methods.each { |p| self.send(p) }
72
76
  @params
73
77
  end
74
78
 
@@ -76,13 +80,14 @@ module Remi
76
80
  def clone
77
81
  the_clone = super
78
82
  the_clone.instance_variable_set(:@params, @params.dup)
83
+ the_clone.instance_variable_set(:@params_methods, @params_methods.dup)
79
84
  the_clone
80
85
  end
81
86
 
82
87
  def __define__(name, &block)
83
- @params[name] = nil
88
+ @params_methods << name unless @params_methods.include? name
84
89
  define_singleton_method name do
85
- @params[name] ||= Remi::Dsl.dsl_return(self, @context, &block)
90
+ @params.fetch(name) { |name| @params[name] = Remi::Dsl.dsl_return(self, @context, &block) }
86
91
  end
87
92
  end
88
93
  end
@@ -10,25 +10,31 @@ module Remi
10
10
  attr_accessor :context, :name
11
11
 
12
12
  def dsl_return
13
- sub_job = Dsl.dsl_return(self, @context, &@block)
14
- raise ArgumentError, "SubJob DSL must return a Remi::Job" unless sub_job.is_a? Job
15
- sub_job
13
+ result = Dsl.dsl_return(self, @context, &@block)
14
+ raise ArgumentError, "SubJob DSL must return a Remi::Job" unless result.is_a? Job
15
+ result
16
16
  end
17
17
 
18
- def job
19
- @job ||= dsl_return
18
+ def sub_job
19
+ @sub_job ||= dsl_return
20
20
  end
21
21
 
22
22
  def fields(data_subject)
23
- job.send(data_subject).dsl_eval.fields
23
+ sub_job.send(data_subject).dsl_eval.fields
24
24
  end
25
25
 
26
26
  def execute
27
- job.execute
27
+ execute! unless @executed
28
+ end
29
+
30
+ def execute!
31
+ result = sub_job.execute
32
+ @executed = true
33
+ result
28
34
  end
29
35
 
30
36
  def execute_transforms
31
- job.execute(:transforms)
37
+ sub_job.execute(:transforms)
32
38
  end
33
39
  end
34
40
  end