remi 0.2.42 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.yardopts +7 -0
- data/Gemfile +1 -1
- data/Gemfile.lock +13 -26
- data/README.md +1 -1
- data/features/step_definitions/remi_step.rb +33 -13
- data/features/sub_job_example.feature +24 -0
- data/features/sub_transform_example.feature +35 -0
- data/features/sub_transform_many_to_many.feature +49 -0
- data/features/support/env_app.rb +1 -1
- data/jobs/all_jobs_shared.rb +19 -16
- data/jobs/copy_source_job.rb +11 -9
- data/jobs/csv_file_target_job.rb +10 -9
- data/jobs/json_job.rb +18 -14
- data/jobs/metadata_job.rb +33 -28
- data/jobs/parameters_job.rb +14 -11
- data/jobs/sample_job.rb +106 -77
- data/jobs/sftp_file_target_job.rb +14 -13
- data/jobs/sub_job_example_job.rb +86 -0
- data/jobs/sub_transform_example_job.rb +43 -0
- data/jobs/sub_transform_many_to_many_job.rb +46 -0
- data/jobs/transforms/concatenate_job.rb +16 -12
- data/jobs/transforms/data_frame_sieve_job.rb +24 -19
- data/jobs/transforms/date_diff_job.rb +15 -11
- data/jobs/transforms/nvl_job.rb +16 -12
- data/jobs/transforms/parse_date_job.rb +17 -14
- data/jobs/transforms/partitioner_job.rb +27 -19
- data/jobs/transforms/prefix_job.rb +13 -10
- data/jobs/transforms/truncate_job.rb +14 -10
- data/jobs/transforms/truthy_job.rb +11 -8
- data/lib/remi.rb +25 -11
- data/lib/remi/data_frame.rb +4 -4
- data/lib/remi/data_frame/daru.rb +1 -37
- data/lib/remi/data_subject.rb +234 -48
- data/lib/remi/data_subjects/csv_file.rb +171 -0
- data/lib/remi/data_subjects/data_frame.rb +106 -0
- data/lib/remi/data_subjects/file_system.rb +115 -0
- data/lib/remi/data_subjects/local_file.rb +109 -0
- data/lib/remi/data_subjects/none.rb +31 -0
- data/lib/remi/data_subjects/postgres.rb +186 -0
- data/lib/remi/data_subjects/s3_file.rb +84 -0
- data/lib/remi/data_subjects/salesforce.rb +211 -0
- data/lib/remi/data_subjects/sftp_file.rb +196 -0
- data/lib/remi/data_subjects/sub_job.rb +50 -0
- data/lib/remi/dsl.rb +74 -0
- data/lib/remi/encoder.rb +45 -0
- data/lib/remi/extractor.rb +21 -0
- data/lib/remi/field_symbolizers.rb +1 -0
- data/lib/remi/job.rb +279 -113
- data/lib/remi/job/parameters.rb +90 -0
- data/lib/remi/job/sub_job.rb +35 -0
- data/lib/remi/job/transform.rb +165 -0
- data/lib/remi/loader.rb +22 -0
- data/lib/remi/monkeys/daru.rb +4 -0
- data/lib/remi/parser.rb +44 -0
- data/lib/remi/testing/business_rules.rb +17 -23
- data/lib/remi/testing/data_stub.rb +2 -2
- data/lib/remi/version.rb +1 -1
- data/remi.gemspec +3 -0
- data/spec/data_subject_spec.rb +475 -11
- data/spec/data_subjects/csv_file_spec.rb +69 -0
- data/spec/data_subjects/data_frame_spec.rb +52 -0
- data/spec/{extractor → data_subjects}/file_system_spec.rb +0 -0
- data/spec/{extractor → data_subjects}/local_file_spec.rb +0 -0
- data/spec/data_subjects/none_spec.rb +41 -0
- data/spec/data_subjects/postgres_spec.rb +80 -0
- data/spec/{extractor → data_subjects}/s3_file_spec.rb +0 -0
- data/spec/data_subjects/salesforce_spec.rb +117 -0
- data/spec/{extractor → data_subjects}/sftp_file_spec.rb +16 -0
- data/spec/data_subjects/sub_job_spec.rb +33 -0
- data/spec/encoder_spec.rb +38 -0
- data/spec/extractor_spec.rb +11 -0
- data/spec/fixtures/sf_bulk_helper_stubs.rb +443 -0
- data/spec/job/transform_spec.rb +257 -0
- data/spec/job_spec.rb +507 -0
- data/spec/loader_spec.rb +11 -0
- data/spec/parser_spec.rb +38 -0
- data/spec/sf_bulk_helper_spec.rb +117 -0
- data/spec/testing/data_stub_spec.rb +5 -3
- metadata +109 -27
- data/features/aggregate.feature +0 -42
- data/jobs/aggregate_job.rb +0 -31
- data/jobs/transforms/transform_jobs.rb +0 -4
- data/lib/remi/data_subject/csv_file.rb +0 -162
- data/lib/remi/data_subject/data_frame.rb +0 -52
- data/lib/remi/data_subject/postgres.rb +0 -134
- data/lib/remi/data_subject/salesforce.rb +0 -136
- data/lib/remi/data_subject/sftp_file.rb +0 -65
- data/lib/remi/extractor/file_system.rb +0 -92
- data/lib/remi/extractor/local_file.rb +0 -43
- data/lib/remi/extractor/s3_file.rb +0 -57
- data/lib/remi/extractor/sftp_file.rb +0 -83
- data/spec/data_subject/csv_file_spec.rb +0 -79
- data/spec/data_subject/data_frame.rb +0 -27
@@ -0,0 +1,196 @@
|
|
1
|
+
module Remi
|
2
|
+
|
3
|
+
# Sftp File extractor
|
4
|
+
# Used to extract files from an SFTP server
|
5
|
+
#
|
6
|
+
# @example
|
7
|
+
#
|
8
|
+
# class MyJob < Remi::Job
|
9
|
+
# source :some_file do
|
10
|
+
# extractor Remi::Extractor::SftpFile.new(
|
11
|
+
# credentials: {
|
12
|
+
# host: 'coolserver.com',
|
13
|
+
# username: 'myself',
|
14
|
+
# password: 'secret'
|
15
|
+
# },
|
16
|
+
# remote_path: '/',
|
17
|
+
# pattern: /^some_file_\d{14}\.csv/,
|
18
|
+
# most_recent_only: true
|
19
|
+
# )
|
20
|
+
#
|
21
|
+
# parser Remi::Parser::CsvFile.new(
|
22
|
+
# csv_options: {
|
23
|
+
# headers: true,
|
24
|
+
# col_sep: ','
|
25
|
+
# }
|
26
|
+
# )
|
27
|
+
# end
|
28
|
+
# end
|
29
|
+
#
|
30
|
+
# job = MyJob.new
|
31
|
+
# job.some_file.df
|
32
|
+
# # =>#<Daru::DataFrame:70153153438500 @name = 4c59cfdd-7de7-4264-8666-83153f46a9e4 @size = 3>
|
33
|
+
# # id name
|
34
|
+
# # 0 1 Albert
|
35
|
+
# # 1 2 Betsy
|
36
|
+
# # 2 3 Camu
|
37
|
+
class Extractor::SftpFile < Extractor::FileSystem
  # Number of times a single download is attempted before the error is re-raised.
  N_RETRY = 3

  # Connection settings captured from the +credentials+ hash at construction.
  attr_reader :host, :username, :password, :port

  # Hands every argument to the parent initializer, then records the SFTP
  # connection settings.
  #
  # @param credentials [Hash] Options hash containing login credentials
  # @option credentials [String] :host SFTP host (e.g., coolserver.com)
  # @option credentials [String] :username SFTP username
  # @option credentials [String] :password SFTP password
  # @option credentials [String] :port SFTP port (default: '22')
  def initialize(*args, **kargs, &block)
    super
    init_sftp_extractor(*args, **kargs)
  end

  # Downloads each item returned by +entries+ (not defined in this class)
  # into +@local_path+, using one SFTP session for all of the downloads.
  #
  # @return [Array<String>] An array of paths to a local copy of the files extracted
  def extract
    connection do |session|
      entries.map do |remote_entry|
        destination = File.join(@local_path, remote_entry.name)
        logger.info "Downloading #{remote_entry.name} to #{destination}"
        retry_download do
          session.download!(File.join(@remote_path, remote_entry.name), destination)
        end
        destination
      end
    end
  end

  # @return [Array<Extractor::FileSystemEntry>] (Memoized) list of entries found in the remote path
  def all_entries
    @all_entries ||= all_entries!
  end

  # Opens a connection and lists +@remote_path+ on every call (not memoized).
  #
  # @return [Array<Extractor::FileSystemEntry>] list of entries found in the remote path
  def all_entries!
    listing = connection { |session| session.dir.entries(@remote_path) }
    listing.map do |remote_entry|
      attrs = remote_entry.attributes
      # Early versions of the protocol don't support create time, so fall
      # back to the modified time when +createtime+ is unavailable.
      created_at = attrs.respond_to?(:createtime) ? attrs.createtime : attrs.mtime
      FileSystemEntry.new(
        pathname: File.join(@remote_path, remote_entry.name),
        create_time: created_at,
        modified_time: attrs.mtime
      )
    end
  end

  private

  # Stores the connection settings; +:host+, +:username+ and +:password+ are
  # required keys (+fetch+ raises when one is missing), +:port+ is optional.
  def init_sftp_extractor(*args, credentials:, **kargs)
    @host = credentials.fetch(:host)
    @username = credentials.fetch(:username)
    @password = credentials.fetch(:password)
    @port = credentials.fetch(:port, '22')
  end

  # Starts an SFTP session, yields it to the given block and returns the
  # value the block produced.
  def connection(&block)
    outcome = nil
    Net::SFTP.start(@host, @username, password: @password, port: @port) do |session|
      outcome = yield session
    end
    outcome
  end

  # Calls the block up to N_RETRY times. A RuntimeError is logged and
  # followed by a one second pause, except on the last attempt, where it is
  # re-raised.
  def retry_download(&block)
    1.upto(N_RETRY) do |attempt|
      begin
        block.call
        break
      rescue RuntimeError => error
        raise error if attempt >= N_RETRY
        logger.error "Download failed with error: #{error.message}"
        logger.error "Retry attempt #{attempt}/#{N_RETRY-1}"
        sleep(1)
      end
    end
  end
end
|
118
|
+
|
119
|
+
|
120
|
+
|
121
|
+
# SFTP file loader
|
122
|
+
#
|
123
|
+
# @example
|
124
|
+
# class MyJob < Remi::Job
|
125
|
+
# target :my_target do
|
126
|
+
# encoder Remi::Encoder::CsvFile.new(
|
127
|
+
# csv_options: { col_sep: '|' }
|
128
|
+
# )
|
129
|
+
# loader Remi::Loader::SftpFile.new(
|
130
|
+
# credentials: { },
|
131
|
+
# remote_path: 'some_test.csv'
|
132
|
+
# )
|
133
|
+
# loader Remi::Loader::SftpFile.new(
|
134
|
+
# credentials: { },
|
135
|
+
# remote_path: 'some_other_test.csv'
|
136
|
+
# )
|
137
|
+
# end
|
138
|
+
# end
|
139
|
+
#
|
140
|
+
# my_df = Daru::DataFrame.new({ a: 1.upto(5).to_a, b: 6.upto(10) })
|
141
|
+
# job = MyJob.new
|
142
|
+
# job.my_target.df = my_df
|
143
|
+
# job.my_target.load
|
144
|
+
class Loader::SftpFile < Loader
  # @return [String, Pathname] path of the file to be created on the SFTP server
  attr_reader :remote_path

  # Hands every argument to the parent initializer, then records the SFTP
  # settings.
  #
  # @param credentials [Hash] Login settings; +:host+, +:username+, +:password+
  #   and (optionally) +:port+ are read when connecting
  # @param remote_path [String, Pathname] Full path to the file to be created on the target filesystem
  def initialize(*args, **kargs, &block)
    super
    init_sftp_loader(*args, **kargs, &block)
  end

  # Copies data to the SFTP Server
  #
  # @param data [Object] The path to the file in the temporary work location
  # @return [true] On success
  def load(data)
    logger.info "Uploading #{data} to #{@credentials[:username]}@#{@credentials[:host]}: #{@remote_path}"
    connection do |session|
      retry_upload { session.upload!(data, @remote_path) }
    end

    true
  end

  private

  # Keeps the credentials hash and the remote path for later use by #load.
  def init_sftp_loader(*args, credentials:, remote_path:, **kargs, &block)
    @credentials = credentials
    @remote_path = remote_path
  end

  # Starts an SFTP session (port '22' unless the credentials name another),
  # yields it to the given block and returns the value the block produced.
  def connection(&block)
    outcome = nil
    Net::SFTP.start(
      @credentials[:host],
      @credentials[:username],
      password: @credentials[:password],
      port: @credentials[:port] || '22'
    ) do |session|
      outcome = yield session
    end
    outcome
  end

  # Calls the block up to +ntry+ times. A RuntimeError is logged and followed
  # by a one second pause, except on the last attempt, where it is re-raised.
  def retry_upload(ntry=2, &block)
    1.upto(ntry) do |attempt|
      begin
        block.call
        break
      rescue RuntimeError => error
        raise error if attempt >= ntry
        logger.error "Upload failed with error: #{error.message}"
        logger.error "Retry attempt #{attempt}/#{ntry-1}"
        sleep(1)
      end
    end
  end
end
|
196
|
+
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
module Remi
|
2
|
+
class Extractor::SubJob < Extractor
  # The sub job wrapper (anything responding to +#job+) and the name of the
  # data subject that is read from it.
  attr_accessor :sub_job
  attr_accessor :data_subject

  # @param sub_job [Object] The sub job (relative to the parent job) to use; must respond to +#job+
  # @param data_subject [Symbol] The name (relative to the sub job) of the sub job's data frame
  def initialize(*args, **kargs, &block)
    super
    init_sub_job_extractor(*args, **kargs, &block)
  end

  # Looks up the data subject on the sub job's job object.
  #
  # @return [Object] the +df+ of that data subject
  def extract
    source_job = sub_job.job
    source_job.send(data_subject).df
  end

  private

  # Records the required +sub_job:+ and +data_subject:+ keyword arguments.
  def init_sub_job_extractor(*args, sub_job:, data_subject:, **kargs, &block)
    @sub_job = sub_job
    @data_subject = data_subject
  end
end
|
25
|
+
|
26
|
+
class Loader::SubJob < Loader
  # The sub job wrapper (anything responding to +#job+) and the name of the
  # data subject that receives the data frame.
  attr_accessor :sub_job
  attr_accessor :data_subject

  # @param sub_job [Object] The sub job (relative to the parent job) to use; must respond to +#job+
  # @param data_subject [Symbol] The name (relative to the sub job) of the sub job's data frame
  def initialize(*args, **kargs, &block)
    super
    init_sub_job_loader(*args, **kargs, &block)
  end

  # Assigns the data frame to the named data subject of the sub job's job.
  #
  # @param data_frame [Object] Data frame to load to target sub job data subject
  # @return [true] On success
  def load(data_frame)
    destination_job = sub_job.job
    destination_job.send(data_subject).df = data_frame
    true
  end

  private

  # Records the required +sub_job:+ and +data_subject:+ keyword arguments.
  def init_sub_job_loader(*args, sub_job:, data_subject:, **kargs, &block)
    @sub_job = sub_job
    @data_subject = data_subject
  end
end
|
50
|
+
end
|
data/lib/remi/dsl.rb
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
module Remi
|
2
|
+
|
3
|
+
# @api private
|
4
|
+
#
|
5
|
+
# A namespace for functions relating to the execution of a block against a
|
6
|
+
# proxy object.
|
7
|
+
#
|
8
|
+
# Much of this code was borrowed from [Docile](https://github.com/ms-ati/docile)
|
9
|
+
# and was modified to support different fallback contexts.
|
10
|
+
# @see Docile [Docile](https://github.com/ms-ati/docile)
|
11
|
+
|
12
|
+
module Dsl
  # Runs a block with +self+ set to a newly built proxy object, so that the
  # commands used inside the block are resolved by +proxy_type+.
  #
  # Before the block runs, every instance variable of +fallback_dsl+ is
  # copied onto the proxy. Afterwards - also when the block raises - the
  # proxy's values for those same variables are written back to
  # +fallback_dsl+, so assignments made inside the block are not lost.
  #
  # @param dsl [Object] context object whose methods make up the
  #   (initial) DSL
  # @param fallback_dsl [Object] context object that the DSL should fall back
  #   to if the primary context fails to resolve
  # @param proxy_type [Class] class instantiated as
  #   +proxy_type.new(dsl, fallback_dsl)+ to serve as the proxy context
  # @param args [Array] arguments to be passed to the block
  # @param block [Proc] the block of DSL commands to be executed
  # @return [Object] the return value of the block
  def exec_in_proxy_context(dsl, fallback_dsl, proxy_type, *args, &block)
    proxy = proxy_type.new(dsl, fallback_dsl)
    begin
      fallback_dsl.instance_variables.each do |name|
        proxy.instance_variable_set(name, fallback_dsl.instance_variable_get(name))
      end
      proxy.instance_exec(*args, &block)
    ensure
      fallback_dsl.instance_variables.each do |name|
        fallback_dsl.instance_variable_set(name, proxy.instance_variable_get(name))
      end
    end
  end

  # Execute a block in the context of an object whose methods represent the
  # commands in a DSL.
  #
  # @note Use with an *imperative* DSL (commands modify the context object)
  #
  # The value produced by the block is discarded; the +dsl+ object itself is
  # what the caller gets back.
  #
  # @param dsl [Object] context object whose methods make up the DSL
  # @param fallback_dsl [Object] context object that the DSL should fall back to
  # @param args [Array] arguments to be passed to the block
  # @param block [Proc] the block of DSL commands to be executed against the
  #   `dsl` context object
  # @return [Object] the `dsl` context object after executing the block
  def dsl_eval(dsl, fallback_dsl, *args, &block)
    exec_in_proxy_context(dsl, fallback_dsl, Docile::FallbackContextProxy, *args, &block)
    dsl
  end

  # Same as #dsl_eval, except that the value produced by the block is
  # returned instead of the +dsl+ object.
  #
  # @param dsl [Object] context object whose methods make up the DSL
  # @param fallback_dsl [Object] context object that the DSL should fall back to
  # @param args [Array] arguments to be passed to the block
  # @param block [Proc] the block of DSL commands to be executed
  # @return [Object] the return value of the block
  def dsl_return(dsl, fallback_dsl, *args, &block)
    exec_in_proxy_context(dsl, fallback_dsl, Docile::FallbackContextProxy, *args, &block)
  end

  module_function :exec_in_proxy_context, :dsl_eval, :dsl_return
end
|
74
|
+
end
|
data/lib/remi/encoder.rb
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
module Remi
|
2
|
+
# An encoder is an object that converts a dataframe into a form that can
|
3
|
+
# be used by a Remi::Loader. This is a parent class meant to be
|
4
|
+
# inherited by child classes that define specific ways to encode
|
5
|
+
# data.
|
6
|
+
class Encoder
  # The context (e.g., a data target) and the logger are plain read/write
  # attributes. The field symbolizer and fields have custom readers below,
  # so only their writers are generated here.
  attr_accessor :context, :logger
  attr_writer :field_symbolizer, :fields

  # Extra positional arguments, keyword arguments and any block are accepted
  # and ignored by this base class.
  #
  # @param context [Object] The context (e.g., DataTarget) for the encoder (default: `nil`)
  # @param field_symbolizer [Proc] The field symbolizer to use for this encoder
  # @param fields [Remi::Fields] A hash of field metadata to be used by the encoder
  # @param logger [Object] The logger for this encoder (default: `Remi::Settings.logger`)
  def initialize(
    *args,
    context: nil,
    field_symbolizer: Remi::FieldSymbolizers[:standard],
    fields: Remi::Fields.new({}),
    logger: Remi::Settings.logger,
    **kargs,
    &block
  )
    @context = context
    @field_symbolizer = field_symbolizer
    @fields = fields
    @logger = logger
  end

  # Placeholder that child classes override with a method converting the
  # data subject's dataframe into a structure that can be loaded into the
  # target system. This base version always raises.
  #
  # @param dataframe [Remi::DataFrame] The dataframe to be encoded
  # @return [Object] The encoded data to be loaded into the target
  # @raise [NoMethodError] always, in this base class
  def encode(dataframe)
    raise NoMethodError, "#{__method__} not defined for #{self.class.name}"
  end

  # @return [Proc] The field symbolizer (uses the context field symbolizer if defined)
  def field_symbolizer
    # The context wins only when it responds to the method and is truthy.
    if context.respond_to?(:field_symbolizer) && context
      context.field_symbolizer
    else
      @field_symbolizer
    end
  end

  # @return [Remi::Fields] The fields (uses the context fields if defined)
  def fields
    # The context wins only when it responds to the method and is truthy.
    if context.respond_to?(:fields) && context
      context.fields
    else
      @fields
    end
  end
end
|
45
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module Remi
|
2
|
+
# An extractor is an object meant to extract data from some external system.
|
3
|
+
# This is a parent class meant to be inherited by child classes that
|
4
|
+
# define specific ways to extract data.
|
5
|
+
class Extractor
  # @return [Object] The logger object used by the extractor
  attr_accessor :logger

  # Extra positional arguments, keyword arguments and any block are accepted
  # and ignored by this base class.
  #
  # @param logger [Object] The logger for this extractor (default: `Remi::Settings.logger`)
  def initialize(*args, logger: Remi::Settings.logger, **kargs, &block)
    @logger = logger
  end

  # Placeholder that child classes override with a method returning data in
  # a format that an appropriate parser can convert into a dataframe. This
  # base version always raises.
  #
  # @raise [NoMethodError] always, in this base class
  def extract
    message = "#{__method__} not defined for #{self.class.name}"
    raise NoMethodError, message
  end
end
|
21
|
+
end
|
data/lib/remi/job.rb
CHANGED
@@ -1,176 +1,342 @@
|
|
1
1
|
module Remi
|
2
|
-
module Job
|
3
|
-
module JobClassMethods
|
4
|
-
attr_accessor :params
|
5
|
-
attr_accessor :sources
|
6
|
-
attr_accessor :targets
|
7
|
-
attr_accessor :transforms
|
8
2
|
|
9
|
-
|
10
|
-
|
11
|
-
|
3
|
+
# The Job class is the foundation for all Remi ETL jobs. It
|
4
|
+
# provides a DSL for defining Remi jobs in a way that is natural for
|
5
|
+
# ETL style applications. In a Remi job, the user defines all of
|
6
|
+
# the sources, transforms, and targets necessary to transform data.
|
7
|
+
# Any number of sources, transforms, and targets can be defined.
|
8
|
+
# Transforms can call other parameterized sub-transforms. Jobs can
|
9
|
+
# collect data from other parameterized sub-jobs, pass data to other
|
10
|
+
# sub-jobs, or both pass and collect data from other sub-jobs.
|
11
|
+
#
|
12
|
+
# Jobs are executed by calling the `#execute` method in an instance
|
13
|
+
# of the job. This triggers all transforms to be executed in the
|
14
|
+
# order they are defined. Sub-transforms are only executed if they
|
15
|
+
# are referenced in a transform. After all transforms have
|
16
|
+
# executed, the targets are loaded in the order they are defined.
|
17
|
+
#
|
18
|
+
#
|
19
|
+
#
|
20
|
+
# @example
|
21
|
+
#
|
22
|
+
# class MyJob < Remi::Job
|
23
|
+
# source :my_csv_file do
|
24
|
+
# extractor my_extractor
|
25
|
+
# parser my_parser
|
26
|
+
# enforce_types
|
27
|
+
# end
|
28
|
+
#
|
29
|
+
# target :my_transformed_file do
|
30
|
+
# loader my_loader
|
31
|
+
# end
|
32
|
+
#
|
33
|
+
# transform :transform_data do
|
34
|
+
# # Data sources are converted into a dataframe the first time the #df method is called.
|
35
|
+
# transform_work = my_csv_file.df.dup # => a copy of the my_csv_file.df dataframe
|
36
|
+
#
|
37
|
+
# # Any arbitrary Ruby is allowed in a transform block. Remi provides a convenient
|
38
|
+
# # source to target map DSL to map fields from sources to targets
|
39
|
+
# Remi::SourceToTargetMap.apply(transform_work, my_transformed_file.df) do
|
40
|
+
# map source(:source_field_id) .target(:prefixed_id)
|
41
|
+
# .transform(->(v) { "PREFIX#{v}" })
|
42
|
+
# end
|
43
|
+
# end
|
44
|
+
# end
|
45
|
+
#
|
46
|
+
# # The job is executed when `#execute` is called on an instance of the job.
|
47
|
+
# # Transforms are executed in the order they are defined. Targets are loaded
|
48
|
+
# # in the order they are defined after all transforms have been executed.
|
49
|
+
# job = MyJob.new
|
50
|
+
# job.execute
|
51
|
+
#
|
52
|
+
#
|
53
|
+
#
|
54
|
+
# @todo MOAR Examples! Subtransforms, subjobs, parameters, references to even more
|
55
|
+
# complete sample jobs.
|
56
|
+
class Job
|
57
|
+
class << self
|
58
|
+
|
59
|
+
def inherited(base)
|
60
|
+
base.instance_variable_set(:@params, params.clone)
|
61
|
+
base.instance_variable_set(:@sources, sources.dup)
|
62
|
+
base.instance_variable_set(:@targets, targets.dup)
|
63
|
+
base.instance_variable_set(:@transforms, transforms.dup)
|
64
|
+
base.instance_variable_set(:@sub_jobs, sub_jobs.dup)
|
12
65
|
end
|
13
66
|
|
14
|
-
|
67
|
+
# @return [Job::Parameters] all parameters defined at the class level
|
68
|
+
def params
|
69
|
+
@params ||= Parameters.new
|
70
|
+
end
|
71
|
+
|
72
|
+
# Defines a job parameter.
|
73
|
+
# @example
|
74
|
+
#
|
75
|
+
# class MyJob < Job
|
76
|
+
# param(:my_param) { 'the best parameter' }
|
77
|
+
# end
|
78
|
+
#
|
79
|
+
# job = MyJob.new
|
80
|
+
# job.params[:my_param] #=> 'the best parameter'
|
81
|
+
def param(name, &block)
|
82
|
+
params.__define__(name, &block)
|
83
|
+
end
|
84
|
+
|
85
|
+
# @return [Array<Symbol>] the list of data source names
|
86
|
+
def sources
|
15
87
|
@sources ||= []
|
16
|
-
|
88
|
+
end
|
17
89
|
|
18
|
-
define_method(name) do
|
19
|
-
iv_name = instance_variable_get("@#{name}")
|
20
|
-
return iv_name if iv_name
|
21
90
|
|
22
|
-
|
91
|
+
# @return [Array<Symbol>] the list of sub-jobs
|
92
|
+
def sub_jobs
|
93
|
+
@sub_jobs ||= []
|
94
|
+
end
|
95
|
+
|
96
|
+
# Defines a sub job resource for this job.
|
97
|
+
# Note that the return value of the DSL block must be an instance of a Remi::Job
|
98
|
+
# @example
|
99
|
+
#
|
100
|
+
# class MyJob < Job
|
101
|
+
# sub_job(:my_sub_job) { MySubJob.new }
|
102
|
+
# end
|
103
|
+
#
|
104
|
+
# job = MyJob.new
|
105
|
+
# job.my_sub_job.job #=> An instance of MySubJob
|
106
|
+
def sub_job(name, &block)
|
107
|
+
sub_jobs << name unless sub_jobs.include? name
|
108
|
+
attr_accessor name
|
109
|
+
|
110
|
+
define_method("__init_#{name}__".to_sym) do
|
111
|
+
sub_job = Job::SubJob.new(self, name: name, &block)
|
112
|
+
instance_variable_set("@#{name}", sub_job)
|
113
|
+
end
|
114
|
+
end
|
115
|
+
|
116
|
+
# Defines a data source.
|
117
|
+
# @example
|
118
|
+
#
|
119
|
+
# class MyJob < Job
|
120
|
+
# source :my_source do
|
121
|
+
# extractor my_extractor
|
122
|
+
# parser my_parser
|
123
|
+
# end
|
124
|
+
# end
|
125
|
+
#
|
126
|
+
# job = MyJob.new
|
127
|
+
# job.my_source.df #=> a dataframe generated after extracting and parsing
|
128
|
+
def source(name, &block)
|
129
|
+
sources << name unless sources.include? name
|
130
|
+
attr_accessor name
|
131
|
+
|
132
|
+
define_method("__init_#{name}__".to_sym) do
|
133
|
+
source = DataSource.new(self, name: name, &block)
|
23
134
|
instance_variable_set("@#{name}", source)
|
24
135
|
end
|
25
136
|
end
|
26
137
|
|
27
|
-
|
138
|
+
# @return [Array<Symbol>] the list of data target names
|
139
|
+
def targets
|
28
140
|
@targets ||= []
|
29
|
-
|
30
|
-
|
31
|
-
define_method(name) do
|
32
|
-
iv_name = instance_variable_get("@#{name}")
|
33
|
-
return iv_name if iv_name
|
141
|
+
end
|
34
142
|
|
35
|
-
|
143
|
+
# Defines a data target.
|
144
|
+
# @example
|
145
|
+
#
|
146
|
+
# class MyJob < Job
|
147
|
+
# target :my_target do
|
148
|
+
# encoder my_encoder
|
149
|
+
# loader my_loader
|
150
|
+
# end
|
151
|
+
# end
|
152
|
+
#
|
153
|
+
# job = MyJob.new
|
154
|
+
# job.my_target.df #=> the dataframe to be encoded and loaded into the target
|
155
|
+
def target(name, &block)
|
156
|
+
targets << name unless targets.include? name
|
157
|
+
attr_accessor name
|
158
|
+
|
159
|
+
define_method("__init_#{name}__".to_sym) do
|
160
|
+
target = DataTarget.new(self, name: name, &block)
|
36
161
|
instance_variable_set("@#{name}", target)
|
37
162
|
end
|
38
163
|
end
|
39
164
|
|
40
|
-
|
41
|
-
|
42
|
-
@transforms[
|
165
|
+
# @return [Array<Symbol>] the list of transform names
|
166
|
+
def transforms
|
167
|
+
@transforms ||= []
|
168
|
+
end
|
43
169
|
|
44
|
-
|
45
|
-
|
46
|
-
|
170
|
+
# Defines a transform.
|
171
|
+
# @example
|
172
|
+
#
|
173
|
+
# class MyJob < Job
|
174
|
+
# transform :my_transform do
|
175
|
+
# puts "hello from my_transform!"
|
176
|
+
# end
|
177
|
+
# end
|
178
|
+
#
|
179
|
+
# job = MyJob.new
|
180
|
+
# job.my_transform.execute #=>(stdout) 'hello from my_transform!'
|
181
|
+
def transform(name, &block)
|
182
|
+
transforms << name unless transforms.include? name
|
183
|
+
attr_accessor name
|
184
|
+
|
185
|
+
define_method("__init_#{name}__".to_sym) do
|
186
|
+
transform = Transform.new(self, name: name, &block)
|
187
|
+
instance_variable_set("@#{name}", transform)
|
47
188
|
end
|
48
189
|
end
|
49
190
|
|
50
|
-
|
51
|
-
|
191
|
+
# Defines a sub-transform.
|
192
|
+
# @example
|
193
|
+
#
|
194
|
+
# class MyJob < Job
|
195
|
+
# sub_transform :my_sub_transform, greeting: 'hello' do
|
196
|
+
# puts "#{params[:greeting]} from my_sub_transform!"
|
197
|
+
# end
|
198
|
+
#
|
199
|
+
# transform :my_transform do
|
200
|
+
# import :my_sub_transform, greeting: 'bonjour' do
|
201
|
+
# end
|
202
|
+
# end
|
203
|
+
# end
|
204
|
+
#
|
205
|
+
# job = MyJob.new
|
206
|
+
# job.my_transform.execute #=>(stdout) 'bonjour from my_sub_transform!'
|
207
|
+
def sub_transform(name, **kargs, &block)
|
208
|
+
define_method(name) do
|
209
|
+
Transform.new(self, name: name, **kargs, &block)
|
210
|
+
end
|
52
211
|
end
|
212
|
+
end
|
53
213
|
|
54
|
-
|
55
|
-
|
56
|
-
|
214
|
+
# Initializes the job
|
215
|
+
#
|
216
|
+
# @param work_dir [String, Path] sets the working directory for this job
|
217
|
+
# @param logger [Object] sets the logger for the job
|
218
|
+
# @param kargs [Hash] Optional job parameters (can be referenced in the job via `#params`)
|
219
|
+
def initialize(work_dir: Settings.work_dir, logger: Settings.logger, **kargs)
|
220
|
+
@work_dir = work_dir
|
221
|
+
@logger = logger
|
222
|
+
create_work_dir
|
57
223
|
|
58
|
-
|
59
|
-
|
60
|
-
|
224
|
+
__init_params__ **kargs
|
225
|
+
__init_sub_jobs__
|
226
|
+
__init_sources__
|
227
|
+
__init_targets__
|
228
|
+
__init_transforms__
|
229
|
+
end
|
61
230
|
|
62
|
-
|
63
|
-
|
64
|
-
end
|
231
|
+
# @return [String] the working directory used for temporary data
|
232
|
+
attr_reader :work_dir
|
65
233
|
|
234
|
+
# @return [Object] the logging object
|
235
|
+
attr_reader :logger
|
66
236
|
|
67
|
-
|
68
|
-
|
69
|
-
end
|
237
|
+
# @return [Job::Parameters] parameters defined at the class level or during instantiation
|
238
|
+
attr_reader :params
|
70
239
|
|
71
|
-
|
72
|
-
|
240
|
+
# @return [Array] list of sub_jobs defined in the job
|
241
|
+
attr_reader :sub_jobs
|
73
242
|
|
74
|
-
|
75
|
-
|
76
|
-
receiver.params = self.params.merge(receiver.params)
|
77
|
-
receiver.sources = self.sources + receiver.sources
|
78
|
-
receiver.targets = self.targets + receiver.targets
|
79
|
-
receiver.transforms = self.transforms.merge(receiver.transforms)
|
80
|
-
end
|
81
|
-
end
|
243
|
+
# @return [Array] list of sources defined in the job
|
244
|
+
attr_reader :sources
|
82
245
|
|
83
|
-
|
84
|
-
|
85
|
-
end
|
246
|
+
# @return [Array] list of targets defined in the job
|
247
|
+
attr_reader :targets
|
86
248
|
|
249
|
+
# @return [Array] list of transforms defined in the job
|
250
|
+
attr_reader :transforms
|
87
251
|
|
88
|
-
def params
|
89
|
-
self.class.params
|
90
|
-
end
|
91
252
|
|
92
|
-
|
93
|
-
|
253
|
+
# Creates a temporary working directory for the job
|
254
|
+
def create_work_dir
|
255
|
+
@logger.info "Creating working directory #{work_dir}"
|
256
|
+
FileUtils.mkdir_p work_dir
|
94
257
|
end
|
95
258
|
|
96
|
-
|
97
|
-
|
259
|
+
|
260
|
+
# @return [self] the job object (needed to reference parent job in transform DSL)
|
261
|
+
def job
|
262
|
+
self
|
98
263
|
end
|
99
264
|
|
100
|
-
def
|
101
|
-
|
265
|
+
def to_s
|
266
|
+
inspect
|
102
267
|
end
|
103
268
|
|
269
|
+
def inspect
|
270
|
+
"#<#{Remi::Job}>: #{self.class}\n" +
|
271
|
+
" parameters: #{params.to_h.keys}\n" +
|
272
|
+
" sources: #{sources}\n" +
|
273
|
+
" targets: #{targets}\n" +
|
274
|
+
" transforms: #{transforms}"
|
275
|
+
end
|
104
276
|
|
105
277
|
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
278
|
+
# Execute the specified components of the job.
|
279
|
+
#
|
280
|
+
# @param components [Array<symbol>] list of components to execute (e.g., `:transforms`, `:load_targets`)
|
281
|
+
#
|
282
|
+
# @return [self]
|
283
|
+
def execute(*components)
|
284
|
+
execute_transforms if components.empty? || components.include?(:transforms)
|
285
|
+
execute_load_targets if components.empty? || components.include?(:load_targets)
|
286
|
+
self
|
111
287
|
end
|
112
288
|
|
113
|
-
|
289
|
+
private
|
114
290
|
|
115
|
-
def
|
116
|
-
self.class.
|
291
|
+
def __init_params__(**kargs)
|
292
|
+
@params = self.class.params.clone
|
293
|
+
add_params **kargs
|
294
|
+
params.context = self
|
117
295
|
end
|
118
296
|
|
119
|
-
def
|
120
|
-
|
297
|
+
def __init_sub_jobs__
|
298
|
+
@sub_jobs = self.class.sub_jobs
|
299
|
+
@sub_jobs.each do |sub_job|
|
300
|
+
send("__init_#{sub_job}__".to_sym)
|
301
|
+
end
|
121
302
|
end
|
122
303
|
|
123
|
-
def
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
else
|
128
|
-
@logger.debug "Not going to delete working directory #{work_dir}"
|
129
|
-
nil
|
304
|
+
def __init_sources__
|
305
|
+
@sources = self.class.sources
|
306
|
+
@sources.each do |source|
|
307
|
+
send("__init_#{source}__".to_sym)
|
130
308
|
end
|
131
309
|
end
|
132
310
|
|
133
|
-
def
|
134
|
-
@
|
135
|
-
|
311
|
+
def __init_targets__
|
312
|
+
@targets = self.class.targets
|
313
|
+
@targets.each do |target|
|
314
|
+
send("__init_#{target}__".to_sym)
|
315
|
+
end
|
136
316
|
end
|
137
317
|
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
#
|
143
|
-
# sources - Array of source names
|
144
|
-
# targets - Array of target names
|
145
|
-
#
|
146
|
-
# Returns an array containing the result of each transform.
|
147
|
-
def run_transforms_using(sources: nil, targets: nil)
|
148
|
-
transforms.map do |t, st|
|
149
|
-
selected_sources = (st[:sources] & Array(sources || st[:sources])).size > 0
|
150
|
-
selected_targets = (st[:targets] & Array(targets || st[:targets])).size > 0
|
151
|
-
self.send(t) if selected_sources && selected_targets
|
318
|
+
def __init_transforms__
|
319
|
+
@transforms = self.class.transforms
|
320
|
+
@transforms.each do |transform|
|
321
|
+
send("__init_#{transform}__".to_sym)
|
152
322
|
end
|
153
323
|
end
|
154
324
|
|
155
|
-
|
156
|
-
|
325
|
+
# Executes all transforms defined
|
326
|
+
def execute_transforms
|
327
|
+
transforms.map { |t| send(t).execute }
|
328
|
+
self
|
157
329
|
end
|
158
330
|
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
end
|
331
|
+
# Loads all targets defined
|
332
|
+
def execute_load_targets
|
333
|
+
targets.each { |t| send(t).load }
|
334
|
+
self
|
164
335
|
end
|
165
336
|
|
166
|
-
#
|
167
|
-
|
168
|
-
|
169
|
-
def run
|
170
|
-
# Do all of the stuff here
|
171
|
-
run_all_transforms
|
172
|
-
load_all_targets
|
173
|
-
self
|
337
|
+
# Adds all parameters listed to the job parameters
|
338
|
+
def add_params(**kargs)
|
339
|
+
kargs.each { |k,v| params[k] = v }
|
174
340
|
end
|
175
341
|
end
|
176
342
|
end
|