remi 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.bundle/config +1 -1
- data/Gemfile +1 -0
- data/Gemfile.lock +45 -5
- data/README.md +245 -0
- data/features/step_definitions/remi_step.rb +16 -0
- data/jobs/sub_job_example_job.rb +5 -5
- data/lib/remi.rb +4 -1
- data/lib/remi/data_subject.rb +10 -1
- data/lib/remi/data_subjects/file_system.rb +31 -1
- data/lib/remi/data_subjects/gsheet.rb +140 -0
- data/lib/remi/data_subjects/sftp_file.rb +1 -0
- data/lib/remi/data_subjects/sub_job.rb +13 -4
- data/lib/remi/encoder.rb +1 -1
- data/lib/remi/job.rb +9 -1
- data/lib/remi/job/parameters.rb +8 -3
- data/lib/remi/job/sub_job.rb +14 -8
- data/lib/remi/loader.rb +14 -2
- data/lib/remi/testing/business_rules.rb +12 -9
- data/lib/remi/transform.rb +9 -0
- data/lib/remi/version.rb +1 -1
- data/spec/data_subject_spec.rb +23 -5
- data/spec/data_subjects/file_system_spec.rb +43 -9
- data/spec/data_subjects/gsheet_spec.rb +133 -0
- data/spec/data_subjects/sub_job_spec.rb +40 -8
- data/spec/job_spec.rb +58 -15
- metadata +5 -2
data/lib/remi.rb
CHANGED
@@ -4,6 +4,8 @@ File.expand_path(File.dirname(__FILE__)).tap {|pwd| $LOAD_PATH.unshift(pwd) unle
|
|
4
4
|
require 'yaml'
|
5
5
|
require 'json'
|
6
6
|
require 'tmpdir'
|
7
|
+
require 'fileutils'
|
8
|
+
|
7
9
|
|
8
10
|
# Gems
|
9
11
|
require 'daru'
|
@@ -56,10 +58,11 @@ require 'remi/loader'
|
|
56
58
|
require 'remi/data_subject'
|
57
59
|
require 'remi/data_subjects/file_system'
|
58
60
|
require 'remi/data_subjects/local_file'
|
61
|
+
#require 'remi/data_subjects/gsheet' # intentionally not included by default (must be optionally added)
|
59
62
|
require 'remi/data_subjects/sftp_file'
|
60
63
|
require 'remi/data_subjects/s3_file'
|
61
64
|
require 'remi/data_subjects/csv_file'
|
62
|
-
#require 'remi/data_subjects/salesforce' # intentionally not included by default
|
65
|
+
#require 'remi/data_subjects/salesforce' # intentionally not included by default (must be optionally added)
|
63
66
|
require 'remi/data_subjects/postgres'
|
64
67
|
require 'remi/data_subjects/data_frame'
|
65
68
|
require 'remi/data_subjects/none'
|
data/lib/remi/data_subject.rb
CHANGED
@@ -64,6 +64,7 @@ module Remi
|
|
64
64
|
|
65
65
|
# @return [Remi::DataFrame] the dataframe associated with this DataSubject
|
66
66
|
def df
|
67
|
+
dsl_eval
|
67
68
|
@dataframe ||= Remi::DataFrame.create(df_type, [], order: fields.keys)
|
68
69
|
end
|
69
70
|
|
@@ -71,6 +72,7 @@ module Remi
|
|
71
72
|
# @param new_dataframe [Object] The new dataframe object to be associated.
|
72
73
|
# @return [Remi::DataFrame] the associated dataframe
|
73
74
|
def df=(new_dataframe)
|
75
|
+
dsl_eval
|
74
76
|
if new_dataframe.respond_to? :df_type
|
75
77
|
@dataframe = new_dataframe
|
76
78
|
else
|
@@ -259,6 +261,7 @@ module Remi
|
|
259
261
|
# @param obj [Object] adds a loader object to the list of loaders
|
260
262
|
# @return [Array] the full list of loaders
|
261
263
|
def loader(obj)
|
264
|
+
obj.context = self
|
262
265
|
loaders << obj unless loaders.include? obj
|
263
266
|
end
|
264
267
|
|
@@ -269,7 +272,7 @@ module Remi
|
|
269
272
|
# @return [true] if successful
|
270
273
|
def load
|
271
274
|
return nil if @loaded || df.size == 0
|
272
|
-
dsl_eval
|
275
|
+
dsl_eval
|
273
276
|
|
274
277
|
load!
|
275
278
|
@loaded = true
|
@@ -284,6 +287,12 @@ module Remi
|
|
284
287
|
true
|
285
288
|
end
|
286
289
|
|
290
|
+
def df=(new_dataframe)
|
291
|
+
super
|
292
|
+
loaders.each { |l| l.load encoded_dataframe if l.autoload }
|
293
|
+
df
|
294
|
+
end
|
295
|
+
|
287
296
|
private
|
288
297
|
|
289
298
|
# @return [Object] the encoded data suitable for the loaders
|
@@ -50,6 +50,8 @@ module Remi
|
|
50
50
|
attr_reader :most_recent_only
|
51
51
|
attr_reader :group_by
|
52
52
|
attr_reader :most_recent_by
|
53
|
+
attr_reader :created_within
|
54
|
+
|
53
55
|
|
54
56
|
# Public: Called to extract files from the source filesystem.
|
55
57
|
#
|
@@ -70,6 +72,8 @@ module Remi
|
|
70
72
|
most_recent_matching_entry_in_group
|
71
73
|
elsif @most_recent_only
|
72
74
|
Array(most_recent_matching_entry)
|
75
|
+
elsif @created_within
|
76
|
+
get_created_within
|
73
77
|
else
|
74
78
|
matching_entries
|
75
79
|
end
|
@@ -83,6 +87,29 @@ module Remi
|
|
83
87
|
matching_entries.sort_by { |e| e.send(@most_recent_by) }.reverse.first
|
84
88
|
end
|
85
89
|
|
90
|
+
def get_created_within
|
91
|
+
|
92
|
+
if @most_recent_only
|
93
|
+
first_entry = matching_entries.sort_by { |e| e.send(@most_recent_by)}.reverse.first
|
94
|
+
if ((Date.today.to_time - Time.at(first_entry.create_time)) / 1.hour) < @created_within
|
95
|
+
Array(first_entry)
|
96
|
+
else
|
97
|
+
raise "No file Found. All files are older than #{@created_within} hrs"
|
98
|
+
end
|
99
|
+
else
|
100
|
+
entries_with_group = matching_entries.map do |entry|
|
101
|
+
if ((Time.new.to_time - Time.at(entry.create_time) ) / 1.seconds) < @created_within
|
102
|
+
entry
|
103
|
+
end
|
104
|
+
end.compact
|
105
|
+
if entries_with_group.length > 0
|
106
|
+
Array(entries_with_group)
|
107
|
+
else
|
108
|
+
raise "No files Found. All files are older than #{@created_within} hrs"
|
109
|
+
end
|
110
|
+
end
|
111
|
+
end
|
112
|
+
|
86
113
|
def most_recent_matching_entry_in_group
|
87
114
|
entries_with_group = matching_entries.map do |entry|
|
88
115
|
match = entry.name.match(@group_by)
|
@@ -103,13 +130,16 @@ module Remi
|
|
103
130
|
|
104
131
|
private
|
105
132
|
|
106
|
-
def init_file_system(*args, remote_path:, pattern: /.*/, local_path: Settings.work_dir, most_recent_only: false, group_by: nil, most_recent_by: :create_time, **kargs, &block)
|
133
|
+
def init_file_system(*args, remote_path:, pattern: /.*/, local_path: Settings.work_dir, most_recent_only: false, group_by: nil, most_recent_by: :create_time, created_within: nil, **kargs, &block)
|
134
|
+
|
107
135
|
@remote_path = Pathname.new(remote_path)
|
108
136
|
@pattern = pattern
|
109
137
|
@local_path = Pathname.new(local_path)
|
110
138
|
@most_recent_only = most_recent_only
|
111
139
|
@group_by = group_by
|
112
140
|
@most_recent_by = most_recent_by
|
141
|
+
@created_within = created_within
|
142
|
+
|
113
143
|
end
|
114
144
|
end
|
115
145
|
end
|
@@ -0,0 +1,140 @@
|
|
1
|
+
require 'google/apis/sheets_v4'
|
2
|
+
require 'google/apis/drive_v3'
|
3
|
+
require 'googleauth'
|
4
|
+
require 'googleauth/stores/file_token_store'
|
5
|
+
require 'googleauth/user_refresh'
|
6
|
+
|
7
|
+
module Remi
|
8
|
+
|
9
|
+
# Extracts spreadsheet values from the Google Sheets files found in a Google Drive folder
|
10
|
+
class Extractor::Gsheet < Extractor::FileSystem
|
11
|
+
|
12
|
+
def initialize(*args, **kargs, &block)
|
13
|
+
super
|
14
|
+
init_gsheet_extractor(*args, **kargs)
|
15
|
+
end
|
16
|
+
|
17
|
+
attr_reader :data
|
18
|
+
attr_reader :client_id
|
19
|
+
attr_reader :client_secret
|
20
|
+
attr_reader :access_token
|
21
|
+
attr_reader :ref_token
|
22
|
+
attr_reader :scope
|
23
|
+
attr_reader :expire_time
|
24
|
+
|
25
|
+
def authorize
|
26
|
+
credentials = Google::Auth::UserRefreshCredentials.new(
|
27
|
+
client_id: @client_id,
|
28
|
+
client_secret: @client_secret,
|
29
|
+
scope: @scope,
|
30
|
+
access_token: @access_token,
|
31
|
+
refresh_token: @refresh_token,
|
32
|
+
expires_at: @expiration_time / 1000
|
33
|
+
)
|
34
|
+
end
|
35
|
+
|
36
|
+
|
37
|
+
def get_file_list(folder_id)
|
38
|
+
service = Google::Apis::DriveV3::DriveService.new
|
39
|
+
service.client_options.application_name = @application_name
|
40
|
+
service.authorization = authorize()
|
41
|
+
response = service_list_files(service, folder_id)
|
42
|
+
response.files
|
43
|
+
end
|
44
|
+
|
45
|
+
def service_list_files(service, folder_id)
|
46
|
+
service.list_files(q: "'#{folder_id}' in parents", page_size: 10, order_by: 'createdTime desc', fields: 'nextPageToken, files(id, name, createdTime, mimeType)')
|
47
|
+
end
|
48
|
+
|
49
|
+
def get_spreadsheet_vals(service, spreadsheet_id)
|
50
|
+
service.get_spreadsheet_values(spreadsheet_id, 'Sheet1')
|
51
|
+
end
|
52
|
+
|
53
|
+
def extract
|
54
|
+
service = Google::Apis::SheetsV4::SheetsService.new
|
55
|
+
service.client_options.application_name = @application_name
|
56
|
+
service.authorization = authorize()
|
57
|
+
@data = []
|
58
|
+
|
59
|
+
entries.each do |file|
|
60
|
+
response = get_spreadsheet_vals(service, file.raw)
|
61
|
+
data.push(response)
|
62
|
+
end
|
63
|
+
|
64
|
+
self
|
65
|
+
end
|
66
|
+
|
67
|
+
# @return [Array<Extractor::FileSystemEntry>] (Memoized) list of files in the Google Drive folder
|
68
|
+
def all_entries
|
69
|
+
@all_entries ||= all_entries!
|
70
|
+
end
|
71
|
+
|
72
|
+
# @return [Array<Extractor::FileSystemEntry>] list of files in the Google Drive folder (fetched on every call, not memoized)
|
73
|
+
def all_entries!
|
74
|
+
gsheet_entries = get_file_list(@default_folder_id)
|
75
|
+
gsheet_entries.map do |entry|
|
76
|
+
entry = entry.to_h
|
77
|
+
FileSystemEntry.new(
|
78
|
+
pathname: File.join(@default_folder_id, entry[:name]),
|
79
|
+
create_time: entry[:created_time],
|
80
|
+
modified_time: entry[:created_time],
|
81
|
+
raw: entry[:id]
|
82
|
+
)
|
83
|
+
end
|
84
|
+
end
|
85
|
+
|
86
|
+
private
|
87
|
+
|
88
|
+
def init_gsheet_extractor(*args, credentials:, folder_id:, **kargs)
|
89
|
+
@default_folder_id = folder_id
|
90
|
+
@oob_uri = 'urn:ietf:wg:oauth:2.0:oob'
|
91
|
+
@application_name = credentials.fetch(:application_name)
|
92
|
+
|
93
|
+
@client_secrets_path = File.join(
|
94
|
+
Dir.home,
|
95
|
+
'.credentials/client_secret.json'
|
96
|
+
)
|
97
|
+
@credentials_path = File.join(
|
98
|
+
Dir.home,
|
99
|
+
'.credentials/sheets.googleapis.com-ruby-remi.yaml'
|
100
|
+
)
|
101
|
+
@client_id = credentials.fetch(:client_id)
|
102
|
+
@access_token = credentials.fetch(:access_token)
|
103
|
+
@refresh_token = credentials.fetch(:refresh_token)
|
104
|
+
@client_secret = credentials.fetch(:client_secret)
|
105
|
+
@project_id = credentials.fetch(:project_id)
|
106
|
+
@scope = ["https://www.googleapis.com/auth/drive","https://www.googleapis.com/auth/spreadsheets"]
|
107
|
+
@expiration_time = Integer(credentials.fetch(:expiration_time))
|
108
|
+
end
|
109
|
+
end
|
110
|
+
|
111
|
+
class Parser::Gsheet < Parser
|
112
|
+
|
113
|
+
def parse(gs_extract)
|
114
|
+
google_vals = gs_extract.data
|
115
|
+
return_hash = nil
|
116
|
+
google_vals.each do |google_val|
|
117
|
+
|
118
|
+
if return_hash.nil?
|
119
|
+
return_hash = Hash.new
|
120
|
+
google_val.values[0].each do |header|
|
121
|
+
return_hash[field_symbolizer.call(header)] = []
|
122
|
+
end
|
123
|
+
end
|
124
|
+
|
125
|
+
keys_temp = return_hash.keys
|
126
|
+
|
127
|
+
google_val.values[1..-1].each do |rows|
|
128
|
+
col_num = 0
|
129
|
+
|
130
|
+
rows.each do |value|
|
131
|
+
return_hash[keys_temp[col_num]] << value
|
132
|
+
col_num +=1
|
133
|
+
end
|
134
|
+
end
|
135
|
+
end
|
136
|
+
Remi::DataFrame.create(:daru, return_hash, order: return_hash.keys)
|
137
|
+
end
|
138
|
+
end
|
139
|
+
|
140
|
+
end
|
@@ -11,7 +11,8 @@ module Remi
|
|
11
11
|
attr_accessor :sub_job, :data_subject
|
12
12
|
|
13
13
|
def extract
|
14
|
-
sub_job.
|
14
|
+
sub_job.execute unless sub_job.sub_job.send(data_subject).is_a? Remi::DataSource
|
15
|
+
sub_job.sub_job.send(data_subject).df
|
15
16
|
end
|
16
17
|
|
17
18
|
private
|
@@ -26,25 +27,33 @@ module Remi
|
|
26
27
|
class Loader::SubJob < Loader
|
27
28
|
# @param sub_job [Object] The name (relative to parent job) of the subjob to use
|
28
29
|
# @param data_subject [Symbol] The name (relative to the sub job) of the sub job's data frame
|
30
|
+
# @param merge_fields [True,False] Indicates whether fields from the calling data subject
|
31
|
+
# should be merged with those defined in the sub job.
|
29
32
|
def initialize(*args, **kargs, &block)
|
30
33
|
super
|
31
34
|
init_sub_job_loader(*args, **kargs, &block)
|
32
35
|
end
|
33
36
|
|
34
|
-
attr_accessor :sub_job, :data_subject
|
37
|
+
attr_accessor :sub_job, :data_subject, :merge_fields
|
35
38
|
|
36
39
|
# @param data_frame [Object] Data frame to load to target sub job data subject
|
37
40
|
# @return [true] On success
|
38
41
|
def load(data_frame)
|
39
|
-
sub_job.
|
42
|
+
sub_job.sub_job.send(data_subject).df = data_frame
|
43
|
+
sub_job.sub_job.send(data_subject).fields.merge! fields if merge_fields
|
44
|
+
true
|
45
|
+
end
|
46
|
+
|
47
|
+
def autoload
|
40
48
|
true
|
41
49
|
end
|
42
50
|
|
43
51
|
private
|
44
52
|
|
45
|
-
def init_sub_job_loader(*args, sub_job:, data_subject:, **kargs, &block)
|
53
|
+
def init_sub_job_loader(*args, sub_job:, data_subject:, merge_fields: true, **kargs, &block)
|
46
54
|
@sub_job = sub_job
|
47
55
|
@data_subject = data_subject
|
56
|
+
@merge_fields = merge_fields
|
48
57
|
end
|
49
58
|
end
|
50
59
|
end
|
data/lib/remi/encoder.rb
CHANGED
@@ -38,7 +38,7 @@ module Remi
|
|
38
38
|
|
39
39
|
# @return [Remi::Fields] The fields (uses the context fields if defined)
|
40
40
|
def fields
|
41
|
-
return context.fields if context
|
41
|
+
return context.fields if context && context.respond_to?(:fields)
|
42
42
|
@fields
|
43
43
|
end
|
44
44
|
end
|
data/lib/remi/job.rb
CHANGED
@@ -271,7 +271,8 @@ module Remi
|
|
271
271
|
" parameters: #{params.to_h.keys}\n" +
|
272
272
|
" sources: #{sources}\n" +
|
273
273
|
" targets: #{targets}\n" +
|
274
|
-
" transforms: #{transforms}"
|
274
|
+
" transforms: #{transforms}\n" +
|
275
|
+
" sub_jobs: #{sub_jobs}"
|
275
276
|
end
|
276
277
|
|
277
278
|
|
@@ -282,6 +283,7 @@ module Remi
|
|
282
283
|
# @return [self]
|
283
284
|
def execute(*components)
|
284
285
|
execute_transforms if components.empty? || components.include?(:transforms)
|
286
|
+
execute_sub_jobs if components.empty? || components.include?(:sub_jobs)
|
285
287
|
execute_load_targets if components.empty? || components.include?(:load_targets)
|
286
288
|
self
|
287
289
|
end
|
@@ -334,6 +336,12 @@ module Remi
|
|
334
336
|
self
|
335
337
|
end
|
336
338
|
|
339
|
+
# Executes all subjobs (not already executed)
|
340
|
+
def execute_sub_jobs
|
341
|
+
sub_jobs.each { |sj| send(sj).execute }
|
342
|
+
self
|
343
|
+
end
|
344
|
+
|
337
345
|
# Adds all parameters listed to the job parameters
|
338
346
|
def add_params(**kargs)
|
339
347
|
kargs.each { |k,v| params[k] = v }
|
data/lib/remi/job/parameters.rb
CHANGED
@@ -39,6 +39,7 @@ module Remi
|
|
39
39
|
class Parameters
|
40
40
|
def initialize(context=nil)
|
41
41
|
@context = context
|
42
|
+
@params_methods = []
|
42
43
|
@params = {}
|
43
44
|
end
|
44
45
|
|
@@ -65,10 +66,13 @@ module Remi
|
|
65
66
|
def []=(name, value)
|
66
67
|
__define__(name) { value } unless respond_to? name
|
67
68
|
@params[name] = value
|
69
|
+
|
70
|
+
value
|
68
71
|
end
|
69
72
|
|
70
|
-
# @return [Hash] The parameters as a hash
|
73
|
+
# @return [Hash] The evaluated parameters as a hash
|
71
74
|
def to_h
|
75
|
+
@params_methods.each { |p| self.send(p) }
|
72
76
|
@params
|
73
77
|
end
|
74
78
|
|
@@ -76,13 +80,14 @@ module Remi
|
|
76
80
|
def clone
|
77
81
|
the_clone = super
|
78
82
|
the_clone.instance_variable_set(:@params, @params.dup)
|
83
|
+
the_clone.instance_variable_set(:@params_methods, @params_methods.dup)
|
79
84
|
the_clone
|
80
85
|
end
|
81
86
|
|
82
87
|
def __define__(name, &block)
|
83
|
-
@
|
88
|
+
@params_methods << name unless @params_methods.include? name
|
84
89
|
define_singleton_method name do
|
85
|
-
@params[name]
|
90
|
+
@params.fetch(name) { |name| @params[name] = Remi::Dsl.dsl_return(self, @context, &block) }
|
86
91
|
end
|
87
92
|
end
|
88
93
|
end
|
data/lib/remi/job/sub_job.rb
CHANGED
@@ -10,25 +10,31 @@ module Remi
|
|
10
10
|
attr_accessor :context, :name
|
11
11
|
|
12
12
|
def dsl_return
|
13
|
-
|
14
|
-
raise ArgumentError, "SubJob DSL must return a Remi::Job" unless
|
15
|
-
|
13
|
+
result = Dsl.dsl_return(self, @context, &@block)
|
14
|
+
raise ArgumentError, "SubJob DSL must return a Remi::Job" unless result.is_a? Job
|
15
|
+
result
|
16
16
|
end
|
17
17
|
|
18
|
-
def
|
19
|
-
@
|
18
|
+
def sub_job
|
19
|
+
@sub_job ||= dsl_return
|
20
20
|
end
|
21
21
|
|
22
22
|
def fields(data_subject)
|
23
|
-
|
23
|
+
sub_job.send(data_subject).dsl_eval.fields
|
24
24
|
end
|
25
25
|
|
26
26
|
def execute
|
27
|
-
|
27
|
+
execute! unless @executed
|
28
|
+
end
|
29
|
+
|
30
|
+
def execute!
|
31
|
+
result = sub_job.execute
|
32
|
+
@executed = true
|
33
|
+
result
|
28
34
|
end
|
29
35
|
|
30
36
|
def execute_transforms
|
31
|
-
|
37
|
+
sub_job.execute(:transforms)
|
32
38
|
end
|
33
39
|
end
|
34
40
|
end
|