remi 0.3.0 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.bundle/config +1 -1
- data/Gemfile +1 -0
- data/Gemfile.lock +45 -5
- data/README.md +245 -0
- data/features/step_definitions/remi_step.rb +16 -0
- data/jobs/sub_job_example_job.rb +5 -5
- data/lib/remi.rb +4 -1
- data/lib/remi/data_subject.rb +10 -1
- data/lib/remi/data_subjects/file_system.rb +31 -1
- data/lib/remi/data_subjects/gsheet.rb +140 -0
- data/lib/remi/data_subjects/sftp_file.rb +1 -0
- data/lib/remi/data_subjects/sub_job.rb +13 -4
- data/lib/remi/encoder.rb +1 -1
- data/lib/remi/job.rb +9 -1
- data/lib/remi/job/parameters.rb +8 -3
- data/lib/remi/job/sub_job.rb +14 -8
- data/lib/remi/loader.rb +14 -2
- data/lib/remi/testing/business_rules.rb +12 -9
- data/lib/remi/transform.rb +9 -0
- data/lib/remi/version.rb +1 -1
- data/spec/data_subject_spec.rb +23 -5
- data/spec/data_subjects/file_system_spec.rb +43 -9
- data/spec/data_subjects/gsheet_spec.rb +133 -0
- data/spec/data_subjects/sub_job_spec.rb +40 -8
- data/spec/job_spec.rb +58 -15
- metadata +5 -2
data/lib/remi.rb
CHANGED
@@ -4,6 +4,8 @@ File.expand_path(File.dirname(__FILE__)).tap {|pwd| $LOAD_PATH.unshift(pwd) unle
|
|
4
4
|
require 'yaml'
|
5
5
|
require 'json'
|
6
6
|
require 'tmpdir'
|
7
|
+
require 'fileutils'
|
8
|
+
|
7
9
|
|
8
10
|
# Gems
|
9
11
|
require 'daru'
|
@@ -56,10 +58,11 @@ require 'remi/loader'
|
|
56
58
|
require 'remi/data_subject'
|
57
59
|
require 'remi/data_subjects/file_system'
|
58
60
|
require 'remi/data_subjects/local_file'
|
61
|
+
#require 'remi/data_subjects/gsheet' # intentionally not included by default (must be optionally added)
|
59
62
|
require 'remi/data_subjects/sftp_file'
|
60
63
|
require 'remi/data_subjects/s3_file'
|
61
64
|
require 'remi/data_subjects/csv_file'
|
62
|
-
#require 'remi/data_subjects/salesforce' # intentionally not included by default
|
65
|
+
#require 'remi/data_subjects/salesforce' # intentionally not included by default (must be optionally added)
|
63
66
|
require 'remi/data_subjects/postgres'
|
64
67
|
require 'remi/data_subjects/data_frame'
|
65
68
|
require 'remi/data_subjects/none'
|
data/lib/remi/data_subject.rb
CHANGED
@@ -64,6 +64,7 @@ module Remi
|
|
64
64
|
|
65
65
|
# Returns the dataframe for this data subject. The DSL is evaluated on
# every call; an empty dataframe (ordered by the declared field names) is
# built the first time no dataframe has been assigned yet.
#
# @return [Remi::DataFrame] the dataframe associated with this DataSubject
def df
  dsl_eval
  return @dataframe if @dataframe

  @dataframe = Remi::DataFrame.create(df_type, [], order: fields.keys)
end
|
69
70
|
|
@@ -71,6 +72,7 @@ module Remi
|
|
71
72
|
# @param new_dataframe [Object] The new dataframe object to be associated.
|
72
73
|
# @return [Remi::DataFrame] the associated dataframe
|
73
74
|
def df=(new_dataframe)
|
75
|
+
dsl_eval
|
74
76
|
if new_dataframe.respond_to? :df_type
|
75
77
|
@dataframe = new_dataframe
|
76
78
|
else
|
@@ -259,6 +261,7 @@ module Remi
|
|
259
261
|
# Registers a loader with this data subject and makes this data subject the
# loader's context. A loader that is already registered is not added again.
#
# @param obj [Object] adds a loader object to the list of loaders
# @return [Array] the full list of loaders
def loader(obj)
  obj.context = self
  loaders << obj unless loaders.include?(obj)
  # Return the list explicitly: the `unless` modifier alone yields nil when
  # the loader was already registered, contradicting the documented return.
  loaders
end
|
264
267
|
|
@@ -269,7 +272,7 @@ module Remi
|
|
269
272
|
# @return [true] if successful
|
270
273
|
def load
|
271
274
|
return nil if @loaded || df.size == 0
|
272
|
-
dsl_eval
|
275
|
+
dsl_eval
|
273
276
|
|
274
277
|
load!
|
275
278
|
@loaded = true
|
@@ -284,6 +287,12 @@ module Remi
|
|
284
287
|
true
|
285
288
|
end
|
286
289
|
|
290
|
+
# Assigns the dataframe through the parent implementation, then hands the
# encoded dataframe to every loader whose +autoload+ is truthy.
#
# @param new_dataframe [Object] the dataframe to associate
# @return [Object] the value of #df after the assignment
def df=(new_dataframe)
  super
  loaders.each do |ldr|
    next unless ldr.autoload

    ldr.load(encoded_dataframe)
  end
  df
end
|
295
|
+
|
287
296
|
private
|
288
297
|
|
289
298
|
# @return [Object] the encoded data suitable for the loaders
|
@@ -50,6 +50,8 @@ module Remi
|
|
50
50
|
attr_reader :most_recent_only
|
51
51
|
attr_reader :group_by
|
52
52
|
attr_reader :most_recent_by
|
53
|
+
attr_reader :created_within
|
54
|
+
|
53
55
|
|
54
56
|
# Public: Called to extract files from the source filesystem.
|
55
57
|
#
|
@@ -70,6 +72,8 @@ module Remi
|
|
70
72
|
most_recent_matching_entry_in_group
|
71
73
|
elsif @most_recent_only
|
72
74
|
Array(most_recent_matching_entry)
|
75
|
+
elsif @created_within
|
76
|
+
get_created_within
|
73
77
|
else
|
74
78
|
matching_entries
|
75
79
|
end
|
@@ -83,6 +87,29 @@ module Remi
|
|
83
87
|
matching_entries.sort_by { |e| e.send(@most_recent_by) }.reverse.first
|
84
88
|
end
|
85
89
|
|
90
|
+
# Selects the matching entries that were created less than +@created_within+
# hours ago.
#
# When +@most_recent_only+ is set, only the newest matching entry (ordered
# by +@most_recent_by+) is considered; otherwise every matching entry that
# is recent enough is returned.
#
# @return [Array] the entries that are recent enough (never empty)
# @raise [RuntimeError] if no matching entry is recent enough
def get_created_within
  now = Time.now
  # Age is measured the same way in both branches: hours elapsed since the
  # entry's create_time, relative to the current time. Previously one branch
  # measured from midnight in hours and the other from now in seconds, while
  # both error messages describe the limit in hours.
  age_in_hours = ->(entry) { (now - Time.at(entry.create_time)) / 3600.0 }

  if @most_recent_only
    newest = matching_entries.max_by { |entry| entry.send(@most_recent_by) }
    # `newest` is nil when nothing matched; report that as "no file found"
    # rather than failing on nil.create_time.
    return [newest] if newest && age_in_hours.call(newest) < @created_within

    raise "No file Found. All files are older than #{@created_within} hrs"
  end

  recent = matching_entries.select { |entry| age_in_hours.call(entry) < @created_within }
  raise "No files Found. All files are older than #{@created_within} hrs" if recent.empty?

  recent
end
|
112
|
+
|
86
113
|
def most_recent_matching_entry_in_group
|
87
114
|
entries_with_group = matching_entries.map do |entry|
|
88
115
|
match = entry.name.match(@group_by)
|
@@ -103,13 +130,16 @@ module Remi
|
|
103
130
|
|
104
131
|
private
|
105
132
|
|
106
|
-
def init_file_system(*args, remote_path:, pattern: /.*/, local_path: Settings.work_dir, most_recent_only: false, group_by: nil, most_recent_by: :create_time, **kargs, &block)
|
133
|
+
# Stores the file system extractor options on the instance.
#
# @param remote_path [String, Pathname] remote location, stored as a Pathname
# @param pattern [Regexp] stored as @pattern (defaults to matching anything)
# @param local_path [String, Pathname] local location, stored as a Pathname
# @param most_recent_only [true, false] stored as @most_recent_only
# @param group_by [Regexp, nil] stored as @group_by
# @param most_recent_by [Symbol] entry attribute used to order entries by recency
# @param created_within [Numeric, nil] age limit read by get_created_within
#   (its error messages describe the limit in hours)
# @return [Object] the value given for created_within
def init_file_system(*args,
                     remote_path:,
                     pattern: /.*/,
                     local_path: Settings.work_dir,
                     most_recent_only: false,
                     group_by: nil,
                     most_recent_by: :create_time,
                     created_within: nil,
                     **kargs, &block)
  @remote_path, @local_path = [remote_path, local_path].map { |path| Pathname.new(path) }
  @pattern = pattern
  @group_by = group_by
  @most_recent_only = most_recent_only
  @most_recent_by = most_recent_by
  @created_within = created_within
end
|
114
144
|
end
|
115
145
|
end
|
@@ -0,0 +1,140 @@
|
|
1
|
+
require 'google/apis/sheets_v4'
|
2
|
+
require 'google/apis/drive_v3'
|
3
|
+
require 'googleauth'
|
4
|
+
require 'googleauth/stores/file_token_store'
|
5
|
+
require 'googleauth/user_refresh'
|
6
|
+
|
7
|
+
module Remi
|
8
|
+
|
9
|
+
# Extractor that lists the files in a Google Drive folder and fetches the
# cell values of each selected spreadsheet through the Google Sheets API.
class Extractor::Gsheet < Extractor::FileSystem

  # @param credentials [#fetch] must provide :application_name, :client_id,
  #   :client_secret, :access_token, :refresh_token, :project_id and
  #   :expiration_time
  # @param folder_id [String] id of the Drive folder whose files are listed
  def initialize(*args, **kargs, &block)
    super
    init_gsheet_extractor(*args, **kargs)
  end

  attr_reader :data
  attr_reader :client_id
  attr_reader :client_secret
  attr_reader :access_token
  attr_reader :refresh_token
  attr_reader :scope
  attr_reader :expiration_time

  # The original reader names never matched an assigned instance variable
  # and therefore always returned nil; keep them as aliases of the real
  # values so existing callers keep working.
  alias_method :ref_token, :refresh_token
  alias_method :expire_time, :expiration_time

  # Builds the OAuth credentials object used by both Google services.
  #
  # @return [Google::Auth::UserRefreshCredentials]
  def authorize
    Google::Auth::UserRefreshCredentials.new(
      client_id:     @client_id,
      client_secret: @client_secret,
      scope:         @scope,
      access_token:  @access_token,
      refresh_token: @refresh_token,
      # NOTE(review): presumably converts milliseconds to seconds - confirm
      # against the source of :expiration_time.
      expires_at:    @expiration_time / 1000
    )
  end

  # Lists the files found in a Drive folder.
  #
  # @param folder_id [String] id of the Drive folder
  # @return [Object] the +files+ member of the Drive list response
  def get_file_list(folder_id)
    service = configure_service(Google::Apis::DriveV3::DriveService.new)
    service_list_files(service, folder_id).files
  end

  # Issues the Drive "list files" request for a folder.
  #
  # NOTE(review): page_size is fixed at 10 and nextPageToken is requested
  # but never followed, so only the 10 most recently created files of the
  # folder are ever returned - confirm this limit is intended.
  #
  # @param service [Google::Apis::DriveV3::DriveService] configured service
  # @param folder_id [String] id of the Drive folder
  # @return [Object] the raw Drive list response
  def service_list_files(service, folder_id)
    service.list_files(
      q: "'#{folder_id}' in parents",
      page_size: 10,
      order_by: 'createdTime desc',
      fields: 'nextPageToken, files(id, name, createdTime, mimeType)'
    )
  end

  # Fetches the values of the range 'Sheet1' of a spreadsheet.
  #
  # @param service [Google::Apis::SheetsV4::SheetsService] configured service
  # @param spreadsheet_id [String] id of the spreadsheet
  # @return [Object] the Sheets values response
  def get_spreadsheet_vals(service, spreadsheet_id)
    service.get_spreadsheet_values(spreadsheet_id, 'Sheet1')
  end

  # Fetches the values of every selected entry and stores the responses,
  # one per entry, in #data.
  #
  # @return [self]
  def extract
    service = configure_service(Google::Apis::SheetsV4::SheetsService.new)
    # Each entry's +raw+ holds the Drive file id (see #all_entries!).
    @data = entries.map { |file| get_spreadsheet_vals(service, file.raw) }

    self
  end

  # @return [Array<Extractor::FileSystemEntry>] (Memoized) list of files in the Drive folder
  def all_entries
    @all_entries ||= all_entries!
  end

  # @return [Array<Extractor::FileSystemEntry>] (Not memoized) list of files in the Drive folder
  def all_entries!
    get_file_list(@default_folder_id).map do |gsheet_entry|
      attributes = gsheet_entry.to_h
      FileSystemEntry.new(
        pathname: File.join(@default_folder_id, attributes[:name]),
        create_time: attributes[:created_time],
        # Only createdTime is requested from Drive, so it doubles as the
        # modified time.
        modified_time: attributes[:created_time],
        raw: attributes[:id]
      )
    end
  end

  private

  # Applies the application name and OAuth credentials to a Google API
  # service object and returns it.
  def configure_service(service)
    service.client_options.application_name = @application_name
    service.authorization = authorize
    service
  end

  # Stores the folder id and the OAuth settings taken from +credentials+.
  # Every credential key is required (+fetch+ fails on a missing key).
  def init_gsheet_extractor(*args, credentials:, folder_id:, **kargs)
    @default_folder_id = folder_id
    @oob_uri = 'urn:ietf:wg:oauth:2.0:oob'
    @application_name = credentials.fetch(:application_name)

    @client_secrets_path = File.join(
      Dir.home,
      '.credentials/client_secret.json'
    )
    @credentials_path = File.join(
      Dir.home,
      '.credentials/sheets.googleapis.com-ruby-remi.yaml'
    )
    @client_id       = credentials.fetch(:client_id)
    @access_token    = credentials.fetch(:access_token)
    @refresh_token   = credentials.fetch(:refresh_token)
    @client_secret   = credentials.fetch(:client_secret)
    @project_id      = credentials.fetch(:project_id)
    @scope           = ["https://www.googleapis.com/auth/drive","https://www.googleapis.com/auth/spreadsheets"]
    @expiration_time = Integer(credentials.fetch(:expiration_time))
  end
end
|
110
|
+
|
111
|
+
class Parser::Gsheet < Parser

  # Builds a Daru dataframe from the value responses collected by the
  # Gsheet extractor.
  #
  # The first row of the first non-empty response supplies the column names
  # (converted with +field_symbolizer+). The first row of every response is
  # skipped as data, and all remaining rows are appended to those columns.
  #
  # @param gs_extract [#data] extractor whose +data+ holds one values
  #   response per spreadsheet
  # @return [Remi::DataFrame] dataframe with one vector per header column
  def parse(gs_extract)
    columns = nil

    gs_extract.data.each do |google_val|
      # A response without any cells has nothing to contribute.
      header, *rows = Array(google_val.values)
      next if header.nil?

      columns ||= header.each_with_object({}) do |name, cols|
        cols[field_symbolizer.call(name)] = []
      end

      rows.each do |row|
        # Walk the known columns rather than the cells of the row: a row may
        # hold fewer cells than the header (trailing empty cells are left
        # out of the response), and every column must still grow by exactly
        # one value per row to stay aligned. Cells beyond the header are
        # ignored instead of crashing on an unknown column.
        columns.each_value.with_index { |column, idx| column << row[idx] }
      end
    end

    # No data at all: return an empty dataframe instead of failing on nil.
    # NOTE(review): assumes Remi::DataFrame.create accepts an empty hash.
    columns ||= {}
    Remi::DataFrame.create(:daru, columns, order: columns.keys)
  end
end
|
139
|
+
|
140
|
+
end
|
@@ -11,7 +11,8 @@ module Remi
|
|
11
11
|
attr_accessor :sub_job, :data_subject
|
12
12
|
|
13
13
|
def extract
|
14
|
-
sub_job.
|
14
|
+
sub_job.execute unless sub_job.sub_job.send(data_subject).is_a? Remi::DataSource
|
15
|
+
sub_job.sub_job.send(data_subject).df
|
15
16
|
end
|
16
17
|
|
17
18
|
private
|
@@ -26,25 +27,33 @@ module Remi
|
|
26
27
|
class Loader::SubJob < Loader
|
27
28
|
# @param sub_job [Object] The name (relative to parent job) of the subjob to use
|
28
29
|
# @param data_subject [Symbol] The name (relative to the sub job) of the sub job's data frame
|
30
|
+
# @param merge_fields [True,False] Indicates whether fields from the calling data subject
|
31
|
+
# should be merged with those defined in the sub job.
|
29
32
|
def initialize(*args, **kargs, &block)
|
30
33
|
super
|
31
34
|
init_sub_job_loader(*args, **kargs, &block)
|
32
35
|
end
|
33
36
|
|
34
|
-
attr_accessor :sub_job, :data_subject
|
37
|
+
attr_accessor :sub_job, :data_subject, :merge_fields
|
35
38
|
|
36
39
|
# @param data_frame [Object] Data frame to load to target sub job data subject
|
37
40
|
# @return [true] On success
|
38
41
|
def load(data_frame)
|
39
|
-
sub_job.
|
42
|
+
sub_job.sub_job.send(data_subject).df = data_frame
|
43
|
+
sub_job.sub_job.send(data_subject).fields.merge! fields if merge_fields
|
44
|
+
true
|
45
|
+
end
|
46
|
+
|
47
|
+
def autoload
|
40
48
|
true
|
41
49
|
end
|
42
50
|
|
43
51
|
private
|
44
52
|
|
45
|
-
def init_sub_job_loader(*args, sub_job:, data_subject:, **kargs, &block)
|
53
|
+
def init_sub_job_loader(*args, sub_job:, data_subject:, merge_fields: true, **kargs, &block)
|
46
54
|
@sub_job = sub_job
|
47
55
|
@data_subject = data_subject
|
56
|
+
@merge_fields = merge_fields
|
48
57
|
end
|
49
58
|
end
|
50
59
|
end
|
data/lib/remi/encoder.rb
CHANGED
@@ -38,7 +38,7 @@ module Remi
|
|
38
38
|
|
39
39
|
# @return [Remi::Fields] The fields (uses the context fields if defined)
|
40
40
|
def fields
|
41
|
-
return context.fields if context
|
41
|
+
return context.fields if context && context.respond_to?(:fields)
|
42
42
|
@fields
|
43
43
|
end
|
44
44
|
end
|
data/lib/remi/job.rb
CHANGED
@@ -271,7 +271,8 @@ module Remi
|
|
271
271
|
" parameters: #{params.to_h.keys}\n" +
|
272
272
|
" sources: #{sources}\n" +
|
273
273
|
" targets: #{targets}\n" +
|
274
|
-
" transforms: #{transforms}"
|
274
|
+
" transforms: #{transforms}\n" +
|
275
|
+
" sub_jobs: #{sub_jobs}"
|
275
276
|
end
|
276
277
|
|
277
278
|
|
@@ -282,6 +283,7 @@ module Remi
|
|
282
283
|
# @return [self]
|
283
284
|
# Runs the requested parts of the job, always in the order transforms,
# sub jobs, load targets. With no arguments every part runs.
#
# @param components [Array<Symbol>] any of :transforms, :sub_jobs, :load_targets
# @return [self]
def execute(*components)
  wanted = ->(stage) { components.empty? || components.include?(stage) }

  execute_transforms   if wanted.call(:transforms)
  execute_sub_jobs     if wanted.call(:sub_jobs)
  execute_load_targets if wanted.call(:load_targets)

  self
end
|
@@ -334,6 +336,12 @@ module Remi
|
|
334
336
|
self
|
335
337
|
end
|
336
338
|
|
339
|
+
# Calls +execute+ on every sub job listed in +sub_jobs+, looking each one
# up by name on this job.
#
# @return [self]
def execute_sub_jobs
  sub_jobs.each do |job_name|
    sub_job_wrapper = send(job_name)
    sub_job_wrapper.execute
  end

  self
end
|
344
|
+
|
337
345
|
# Adds all parameters listed to the job parameters
|
338
346
|
def add_params(**kargs)
|
339
347
|
kargs.each { |k,v| params[k] = v }
|
data/lib/remi/job/parameters.rb
CHANGED
@@ -39,6 +39,7 @@ module Remi
|
|
39
39
|
class Parameters
|
40
40
|
def initialize(context=nil)
|
41
41
|
@context = context
|
42
|
+
@params_methods = []
|
42
43
|
@params = {}
|
43
44
|
end
|
44
45
|
|
@@ -65,10 +66,13 @@ module Remi
|
|
65
66
|
# Stores a parameter value. If this object does not yet respond to +name+,
# an accessor for it is defined first.
#
# @param name [Symbol] the parameter name
# @param value [Object] the value to store
# @return [Object] the stored value
def []=(name, value)
  unless respond_to?(name)
    __define__(name) { value }
  end

  @params.store(name, value)
  value
end
|
69
72
|
|
70
|
-
# @return [Hash] The parameters as a hash
|
73
|
+
# Calls every registered parameter accessor, then returns the parameter
# hash itself (not a copy).
#
# @return [Hash] The evaluated parameters as a hash
def to_h
  @params_methods.each do |accessor|
    send(accessor)
  end

  @params
end
|
74
78
|
|
@@ -76,13 +80,14 @@ module Remi
|
|
76
80
|
# Clones the object and gives the clone its own copies of the parameter
# hash and of the accessor list, so later additions to one do not show up
# in the other.
#
# @return [Object] the clone
def clone
  super.tap do |copy|
    copy.instance_variable_set(:@params, @params.dup)
    copy.instance_variable_set(:@params_methods, @params_methods.dup)
  end
end
|
81
86
|
|
82
87
|
def __define__(name, &block)
|
83
|
-
@
|
88
|
+
@params_methods << name unless @params_methods.include? name
|
84
89
|
define_singleton_method name do
|
85
|
-
@params[name]
|
90
|
+
@params.fetch(name) { |name| @params[name] = Remi::Dsl.dsl_return(self, @context, &block) }
|
86
91
|
end
|
87
92
|
end
|
88
93
|
end
|
data/lib/remi/job/sub_job.rb
CHANGED
@@ -10,25 +10,31 @@ module Remi
|
|
10
10
|
attr_accessor :context, :name
|
11
11
|
|
12
12
|
def dsl_return
|
13
|
-
|
14
|
-
raise ArgumentError, "SubJob DSL must return a Remi::Job" unless
|
15
|
-
|
13
|
+
result = Dsl.dsl_return(self, @context, &@block)
|
14
|
+
raise ArgumentError, "SubJob DSL must return a Remi::Job" unless result.is_a? Job
|
15
|
+
result
|
16
16
|
end
|
17
17
|
|
18
|
-
def
|
19
|
-
@
|
18
|
+
def sub_job
|
19
|
+
@sub_job ||= dsl_return
|
20
20
|
end
|
21
21
|
|
22
22
|
def fields(data_subject)
|
23
|
-
|
23
|
+
sub_job.send(data_subject).dsl_eval.fields
|
24
24
|
end
|
25
25
|
|
26
26
|
# Executes the sub job unless the executed flag is already set.
#
# @return [Object, nil] the result of #execute!, or nil if already executed
def execute
  return if @executed

  execute!
end
|
29
|
+
|
30
|
+
# Executes the sub job unconditionally and sets the executed flag once the
# sub job's +execute+ has returned.
#
# @return [Object] whatever the sub job's +execute+ returns
def execute!
  sub_job.execute.tap { @executed = true }
end
|
29
35
|
|
30
36
|
def execute_transforms
|
31
|
-
|
37
|
+
sub_job.execute(:transforms)
|
32
38
|
end
|
33
39
|
end
|
34
40
|
end
|