remi 0.2.42 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.yardopts +7 -0
- data/Gemfile +1 -1
- data/Gemfile.lock +13 -26
- data/README.md +1 -1
- data/features/step_definitions/remi_step.rb +33 -13
- data/features/sub_job_example.feature +24 -0
- data/features/sub_transform_example.feature +35 -0
- data/features/sub_transform_many_to_many.feature +49 -0
- data/features/support/env_app.rb +1 -1
- data/jobs/all_jobs_shared.rb +19 -16
- data/jobs/copy_source_job.rb +11 -9
- data/jobs/csv_file_target_job.rb +10 -9
- data/jobs/json_job.rb +18 -14
- data/jobs/metadata_job.rb +33 -28
- data/jobs/parameters_job.rb +14 -11
- data/jobs/sample_job.rb +106 -77
- data/jobs/sftp_file_target_job.rb +14 -13
- data/jobs/sub_job_example_job.rb +86 -0
- data/jobs/sub_transform_example_job.rb +43 -0
- data/jobs/sub_transform_many_to_many_job.rb +46 -0
- data/jobs/transforms/concatenate_job.rb +16 -12
- data/jobs/transforms/data_frame_sieve_job.rb +24 -19
- data/jobs/transforms/date_diff_job.rb +15 -11
- data/jobs/transforms/nvl_job.rb +16 -12
- data/jobs/transforms/parse_date_job.rb +17 -14
- data/jobs/transforms/partitioner_job.rb +27 -19
- data/jobs/transforms/prefix_job.rb +13 -10
- data/jobs/transforms/truncate_job.rb +14 -10
- data/jobs/transforms/truthy_job.rb +11 -8
- data/lib/remi.rb +25 -11
- data/lib/remi/data_frame.rb +4 -4
- data/lib/remi/data_frame/daru.rb +1 -37
- data/lib/remi/data_subject.rb +234 -48
- data/lib/remi/data_subjects/csv_file.rb +171 -0
- data/lib/remi/data_subjects/data_frame.rb +106 -0
- data/lib/remi/data_subjects/file_system.rb +115 -0
- data/lib/remi/data_subjects/local_file.rb +109 -0
- data/lib/remi/data_subjects/none.rb +31 -0
- data/lib/remi/data_subjects/postgres.rb +186 -0
- data/lib/remi/data_subjects/s3_file.rb +84 -0
- data/lib/remi/data_subjects/salesforce.rb +211 -0
- data/lib/remi/data_subjects/sftp_file.rb +196 -0
- data/lib/remi/data_subjects/sub_job.rb +50 -0
- data/lib/remi/dsl.rb +74 -0
- data/lib/remi/encoder.rb +45 -0
- data/lib/remi/extractor.rb +21 -0
- data/lib/remi/field_symbolizers.rb +1 -0
- data/lib/remi/job.rb +279 -113
- data/lib/remi/job/parameters.rb +90 -0
- data/lib/remi/job/sub_job.rb +35 -0
- data/lib/remi/job/transform.rb +165 -0
- data/lib/remi/loader.rb +22 -0
- data/lib/remi/monkeys/daru.rb +4 -0
- data/lib/remi/parser.rb +44 -0
- data/lib/remi/testing/business_rules.rb +17 -23
- data/lib/remi/testing/data_stub.rb +2 -2
- data/lib/remi/version.rb +1 -1
- data/remi.gemspec +3 -0
- data/spec/data_subject_spec.rb +475 -11
- data/spec/data_subjects/csv_file_spec.rb +69 -0
- data/spec/data_subjects/data_frame_spec.rb +52 -0
- data/spec/{extractor → data_subjects}/file_system_spec.rb +0 -0
- data/spec/{extractor → data_subjects}/local_file_spec.rb +0 -0
- data/spec/data_subjects/none_spec.rb +41 -0
- data/spec/data_subjects/postgres_spec.rb +80 -0
- data/spec/{extractor → data_subjects}/s3_file_spec.rb +0 -0
- data/spec/data_subjects/salesforce_spec.rb +117 -0
- data/spec/{extractor → data_subjects}/sftp_file_spec.rb +16 -0
- data/spec/data_subjects/sub_job_spec.rb +33 -0
- data/spec/encoder_spec.rb +38 -0
- data/spec/extractor_spec.rb +11 -0
- data/spec/fixtures/sf_bulk_helper_stubs.rb +443 -0
- data/spec/job/transform_spec.rb +257 -0
- data/spec/job_spec.rb +507 -0
- data/spec/loader_spec.rb +11 -0
- data/spec/parser_spec.rb +38 -0
- data/spec/sf_bulk_helper_spec.rb +117 -0
- data/spec/testing/data_stub_spec.rb +5 -3
- metadata +109 -27
- data/features/aggregate.feature +0 -42
- data/jobs/aggregate_job.rb +0 -31
- data/jobs/transforms/transform_jobs.rb +0 -4
- data/lib/remi/data_subject/csv_file.rb +0 -162
- data/lib/remi/data_subject/data_frame.rb +0 -52
- data/lib/remi/data_subject/postgres.rb +0 -134
- data/lib/remi/data_subject/salesforce.rb +0 -136
- data/lib/remi/data_subject/sftp_file.rb +0 -65
- data/lib/remi/extractor/file_system.rb +0 -92
- data/lib/remi/extractor/local_file.rb +0 -43
- data/lib/remi/extractor/s3_file.rb +0 -57
- data/lib/remi/extractor/sftp_file.rb +0 -83
- data/spec/data_subject/csv_file_spec.rb +0 -79
- data/spec/data_subject/data_frame.rb +0 -27
@@ -0,0 +1,196 @@
|
|
1
|
+
module Remi
|
2
|
+
|
3
|
+
# Sftp File extractor
|
4
|
+
# Used to extract files from an SFTP server
|
5
|
+
#
|
6
|
+
# @example
|
7
|
+
#
|
8
|
+
# class MyJob < Remi::Job
|
9
|
+
# source :some_file do
|
10
|
+
# extractor Remi::Extractor::SftpFile.new(
|
11
|
+
# credentials: {
|
12
|
+
# host: 'coolserver.com',
|
13
|
+
# username: 'myself',
|
14
|
+
# password: 'secret'
|
15
|
+
# },
|
16
|
+
# remote_path: '/',
|
17
|
+
# pattern: /^some_file_\d{14}\.csv/,
|
18
|
+
# most_recent_only: true
|
19
|
+
# )
|
20
|
+
#
|
21
|
+
# parser Remi::Parser::CsvFile.new(
|
22
|
+
# csv_options: {
|
23
|
+
# headers: true,
|
24
|
+
# col_sep: ','
|
25
|
+
# }
|
26
|
+
# )
|
27
|
+
# end
|
28
|
+
# end
|
29
|
+
#
|
30
|
+
# job = MyJob.new
|
31
|
+
# job.some_file.df
|
32
|
+
# # =>#<Daru::DataFrame:70153153438500 @name = 4c59cfdd-7de7-4264-8666-83153f46a9e4 @size = 3>
|
33
|
+
# # id name
|
34
|
+
# # 0 1 Albert
|
35
|
+
# # 1 2 Betsy
|
36
|
+
# # 2 3 Camu
|
37
|
+
class Extractor::SftpFile < Extractor::FileSystem
  # Maximum number of attempts made to download a single file
  N_RETRY = 3

  # @param credentials [Hash] Options hash containing login credentials
  # @option credentials [String] :host SFTP host (e.g., coolserver.com)
  # @option credentials [String] :username SFTP username
  # @option credentials [String] :password SFTP password
  # @option credentials [String] :port SFTP port (default: 22)
  def initialize(*args, **kargs, &block)
    super
    init_sftp_extractor(*args, **kargs)
  end

  # @return [String] SFTP host
  attr_reader :host

  # @return [String] SFTP username
  attr_reader :username

  # @return [String] SFTP password
  attr_reader :password

  # @return [String] SFTP port
  attr_reader :port

  # Called to extract files from the source filesystem.
  # @return [Array<String>] An array of paths to a local copy of the files extracted
  def extract
    # Resolve the list of files before opening the download session.  A
    # listing that has not been memoized yet opens an SFTP session of its
    # own (see #all_entries!); doing that inside the download session
    # would hold two sessions open at the same time.
    # NOTE(review): assumes the inherited #entries is derived from
    # #all_entries - confirm against Extractor::FileSystem.
    remote_entries = entries

    connection do |sftp|
      remote_entries.map do |entry|
        local_file = File.join(@local_path, entry.name)
        logger.info "Downloading #{entry.name} to #{local_file}"
        retry_download { sftp.download!(File.join(@remote_path, entry.name), local_file) }
        local_file
      end
    end
  end

  # @return [Array<Extractor::FileSystemEntry>] (Memoized) list of entries found in the remote path
  def all_entries
    @all_entries ||= all_entries!
  end

  # Lists the remote path over a new SFTP session on every call (not memoized).
  # @return [Array<Extractor::FileSystemEntry>] list of entries found in the remote path
  def all_entries!
    sftp_entries = connection { |sftp| sftp.dir.entries(@remote_path) }

    sftp_entries.map do |entry|
      attributes = entry.attributes

      # Early versions of the protocol don't support create time, fake it with modified time?
      FileSystemEntry.new(
        pathname: File.join(@remote_path, entry.name),
        create_time: attributes.respond_to?(:createtime) ? attributes.createtime : attributes.mtime,
        modified_time: attributes.mtime
      )
    end
  end


  private

  # Stores the login credentials; :host, :username and :password are
  # required (KeyError when missing), :port falls back to '22'.
  def init_sftp_extractor(*args, credentials:, **kargs)
    @host     = credentials.fetch(:host)
    @username = credentials.fetch(:username)
    @password = credentials.fetch(:password)
    @port     = credentials.fetch(:port, '22')
  end

  # Opens an SFTP session, yields it to the block and returns the value
  # of the block.
  def connection(&block)
    result = nil
    Net::SFTP.start(@host, @username, password: @password, port: @port) do |sftp|
      result = yield sftp
    end
    result
  end

  # Calls the block up to N_RETRY times, pausing one second between
  # attempts.  Only RuntimeError (and subclasses) triggers a retry; the
  # error raised by the last attempt is propagated to the caller.
  def retry_download(&block)
    1.upto(N_RETRY) do |itry|
      begin
        block.call
        break
      rescue RuntimeError => err
        raise err unless itry < N_RETRY
        logger.error "Download failed with error: #{err.message}"
        logger.error "Retry attempt #{itry}/#{N_RETRY-1}"
        sleep(1)
      end
    end
  end
end
|
118
|
+
|
119
|
+
|
120
|
+
|
121
|
+
# SFTP file loader
|
122
|
+
#
|
123
|
+
# @example
|
124
|
+
# class MyJob < Remi::Job
|
125
|
+
# target :my_target do
|
126
|
+
# encoder Remi::Encoder::CsvFile.new(
|
127
|
+
# csv_options: { col_sep: '|' }
|
128
|
+
# )
|
129
|
+
# loader Remi::Loader::SftpFile.new(
|
130
|
+
# credentials: { },
|
131
|
+
# remote_path: 'some_test.csv'
|
132
|
+
# )
|
133
|
+
# loader Remi::Loader::SftpFile.new(
|
134
|
+
# credentials: { },
|
135
|
+
# remote_path: 'some_other_test.csv'
|
136
|
+
# )
|
137
|
+
# end
|
138
|
+
# end
|
139
|
+
#
|
140
|
+
# my_df = Daru::DataFrame.new({ a: 1.upto(5).to_a, b: 6.upto(10).to_a })
|
141
|
+
# job = MyJob.new
|
142
|
+
# job.my_target.df = my_df
|
143
|
+
# job.my_target.load
|
144
|
+
class Loader::SftpFile < Loader

  # @param credentials [Hash] Login credentials, read with the keys :host, :username, :password and :port (port falls back to '22')
  # @param remote_path [String, Pathname] Full path to the file to be created on the target filesystem
  def initialize(*args, **kargs, &block)
    super
    init_sftp_loader(*args, **kargs, &block)
  end

  # @return [String, Pathname] Path of the file created on the target filesystem
  attr_reader :remote_path

  # Uploads data to the SFTP server, retrying a failed upload.
  # @param data [Object] The path to the file in the temporary work location
  # @return [true] On success
  def load(data)
    logger.info "Uploading #{data} to #{@credentials[:username]}@#{@credentials[:host]}: #{@remote_path}"
    connection { |sftp| retry_upload { sftp.upload! data, @remote_path } }
    true
  end


  private

  # Keeps the credentials hash and the remote path for later use.
  def init_sftp_loader(*_args, credentials:, remote_path:, **_kargs, &_block)
    @credentials = credentials
    @remote_path = remote_path
  end

  # Opens an SFTP session with the stored credentials, yields it and
  # hands back whatever the block returned.
  def connection(&block)
    host     = @credentials[:host]
    user     = @credentials[:username]
    password = @credentials[:password]
    port     = @credentials[:port] || '22'

    outcome = nil
    Net::SFTP.start(host, user, password: password, port: port) { |sftp| outcome = yield sftp }
    outcome
  end

  # Runs the block up to +ntry+ times with a one second pause between
  # attempts.  Only RuntimeError (and subclasses) is retried; the error
  # from the final attempt is re-raised.
  def retry_upload(ntry=2, &block)
    1.upto(ntry) do |attempt|
      begin
        block.call
        break
      rescue RuntimeError => failure
        raise failure if attempt >= ntry
        logger.error "Upload failed with error: #{failure.message}"
        logger.error "Retry attempt #{attempt}/#{ntry-1}"
        sleep(1)
      end
    end
  end
end
|
196
|
+
end
|
@@ -0,0 +1,50 @@
|
|
1
|
+
module Remi
|
2
|
+
class Extractor::SubJob < Extractor

  # @param sub_job [Object] The sub job (relative to parent job) to extract from; must respond to #job
  # @param data_subject [Symbol] The name (relative to the sub job) of the sub job's data subject
  def initialize(*args, **kargs, &block)
    super
    init_sub_job_extractor(*args, **kargs, &block)
  end

  attr_accessor :sub_job
  attr_accessor :data_subject

  # @return [Object] The data frame of the sub job's data subject
  def extract
    subject = sub_job.job.send(data_subject)
    subject.df
  end

  private

  # Stores the required keyword arguments; everything else is ignored.
  def init_sub_job_extractor(*_args, sub_job:, data_subject:, **_kargs, &_block)
    @sub_job = sub_job
    @data_subject = data_subject
  end

end
|
25
|
+
|
26
|
+
class Loader::SubJob < Loader
  # @param sub_job [Object] The sub job (relative to parent job) to load into; must respond to #job
  # @param data_subject [Symbol] The name (relative to the sub job) of the sub job's data subject
  def initialize(*args, **kargs, &block)
    super
    init_sub_job_loader(*args, **kargs, &block)
  end

  attr_accessor :sub_job
  attr_accessor :data_subject

  # Assigns the data frame to the sub job's data subject.
  # @param data_frame [Object] Data frame to load to target sub job data subject
  # @return [true] On success
  def load(data_frame)
    subject = sub_job.job.send(data_subject)
    subject.df = data_frame
    true
  end

  private

  # Stores the required keyword arguments; everything else is ignored.
  def init_sub_job_loader(*_args, sub_job:, data_subject:, **_kargs, &_block)
    @sub_job = sub_job
    @data_subject = data_subject
  end
end
|
50
|
+
end
|
data/lib/remi/dsl.rb
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
module Remi
|
2
|
+
|
3
|
+
# @api private
|
4
|
+
#
|
5
|
+
# A namespace for functions relating to the execution of a block against a
|
6
|
+
# proxy object.
|
7
|
+
#
|
8
|
+
# Much of this code was borrowed from [Docile](https://github.com/ms-ati/docile)
|
9
|
+
# and was modified to support different fallback contexts.
|
10
|
+
# @see Docile [Docile](https://github.com/ms-ati/docile)
|
11
|
+
|
12
|
+
module Dsl
  # Runs a block against a proxy object that resolves DSL commands on
  # `dsl` and is built with `fallback_dsl` as its second context.
  #
  # Instance variables of `fallback_dsl` are copied onto the proxy before
  # the block runs and copied back afterwards (also when the block
  # raises), so the block can read and update them.
  #
  # @param dsl [Object] context object whose methods make up the
  #   (initial) DSL
  # @param fallback_dsl [Object] context object that the DSL should fall back
  #   to if the primary context fails to resolve
  # @param proxy_type [FallbackContextProxy, ChainingFallbackContextProxy]
  #   which class to instantiate as proxy context
  # @param args [Array] arguments to be passed to the block
  # @param block [Proc] the block of DSL commands to be executed
  # @return [Object] the return value of the block
  def exec_in_proxy_context(dsl, fallback_dsl, proxy_type, *args, &block)
    proxy = proxy_type.new(dsl, fallback_dsl)

    begin
      # Seed the proxy with the fallback context's instance variables.
      fallback_dsl.instance_variables.each do |name|
        proxy.instance_variable_set(name, fallback_dsl.instance_variable_get(name))
      end

      proxy.instance_exec(*args, &block)
    ensure
      # Hand any values the block changed back to the fallback context.
      fallback_dsl.instance_variables.each do |name|
        fallback_dsl.instance_variable_set(name, proxy.instance_variable_get(name))
      end
    end
  end

  # Executes a block of *imperative* DSL commands: the commands act on
  # the `dsl` context object, the return value of the block is discarded
  # and the `dsl` object itself is returned.
  #
  # @param dsl [Object] context object whose methods make up the DSL
  # @param fallback_dsl [Object] context object that the DSL should fallback to
  # @param args [Array] arguments to be passed to the block
  # @param block [Proc] the block of DSL commands to be executed against the
  #   `dsl` context object
  # @return [Object] the `dsl` context object after executing the block
  def dsl_eval(dsl, fallback_dsl, *args, &block)
    exec_in_proxy_context(dsl, fallback_dsl, Docile::FallbackContextProxy, *args, &block)
    dsl
  end

  # Executes a block of DSL commands like #dsl_eval, but returns the
  # value of the block instead of the `dsl` object.
  #
  # @param dsl [Object] context object whose methods make up the DSL
  # @param fallback_dsl [Object] context object that the DSL should fallback to
  # @param args [Array] arguments to be passed to the block
  # @param block [Proc] the block of DSL commands to be executed
  # @return [Object] the return value of the block
  def dsl_return(dsl, fallback_dsl, *args, &block)
    exec_in_proxy_context(dsl, fallback_dsl, Docile::FallbackContextProxy, *args, &block)
  end

  module_function :exec_in_proxy_context, :dsl_eval, :dsl_return
end
|
74
|
+
end
|
data/lib/remi/encoder.rb
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
module Remi
|
2
|
+
# An encoder is an object that converts a dataframe into a form that can
|
3
|
+
# be used by a Remi::Loader. This is a parent class meant to be
|
4
|
+
# inherited by child classes that define specific ways to encode
|
5
|
+
# data.
|
6
|
+
class Encoder

  # @param context [Object] The context (e.g., DataTarget) for the encoder (default: `nil`)
  # @param field_symbolizer [Proc] The field symbolizer to use for this encoder
  # @param fields [Remi::Fields] A hash of field metadata to be used by the encoder
  # @param logger [Object] The logger used by the encoder
  def initialize(*args, context: nil, field_symbolizer: Remi::FieldSymbolizers[:standard], fields: Remi::Fields.new({}), logger: Remi::Settings.logger, **kargs, &block)
    @context = context
    @field_symbolizer = field_symbolizer

    @fields = fields
    @logger = logger
  end

  # @return [Object] The context (e.g., DataTarget) for the encoder
  attr_accessor :context

  # @return [Object] The logger used by the encoder
  attr_accessor :logger

  attr_writer :field_symbolizer
  attr_writer :fields

  # Any child classes need to define an encode method that converts the
  # data subject's dataframe into a structure that can be loaded into the
  # target system.
  # @param dataframe [Remi::DataFrame] The dataframe to be encoded
  # @return [Object] The encoded data to be loaded into the target
  # @raise [NoMethodError] Always, unless overridden by a child class
  def encode(dataframe)
    raise NoMethodError, "#{__method__} not defined for #{self.class.name}"
  end

  # @return [Proc] The field symbolizer (uses the context field symbolizer if defined)
  def field_symbolizer
    # A nil context does not respond to the reader, so this single guard
    # also covers the "no context" case.
    return context.field_symbolizer if context.respond_to? :field_symbolizer
    @field_symbolizer
  end

  # @return [Remi::Fields] The fields (uses the context fields if defined)
  def fields
    # Same guard as #field_symbolizer: a nil context falls through.
    return context.fields if context.respond_to? :fields
    @fields
  end
end
|
45
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
module Remi
|
2
|
+
# An extractor is an object meant to extract data from some external system.
|
3
|
+
# This is a parent class meant to be inherited by child classes that
|
4
|
+
# define specific ways to extract data.
|
5
|
+
class Extractor

  # @return [Object] The logger object used by the extractor
  attr_accessor :logger

  # Positional arguments, unknown keywords and the block are accepted and
  # ignored here so that child classes can forward all of their arguments.
  #
  # @param logger [Object] The logger object used by the extractor
  def initialize(*_args, logger: Remi::Settings.logger, **_kargs, &_block)
    @logger = logger
  end

  # Any child classes need to define an extract method that returns data
  # in a format that an appropriate parser can use to convert into a dataframe
  # @raise [NoMethodError] Always, unless overridden by a child class
  def extract
    raise NoMethodError, "#{__method__} not defined for #{self.class.name}"
  end

end
|
21
|
+
end
|
data/lib/remi/job.rb
CHANGED
@@ -1,176 +1,342 @@
|
|
1
1
|
module Remi
|
2
|
-
module Job
|
3
|
-
module JobClassMethods
|
4
|
-
attr_accessor :params
|
5
|
-
attr_accessor :sources
|
6
|
-
attr_accessor :targets
|
7
|
-
attr_accessor :transforms
|
8
2
|
|
9
|
-
|
10
|
-
|
11
|
-
|
3
|
+
# The Job class is the foundation for all Remi ETL jobs. It
|
4
|
+
# provides a DSL for defining Remi jobs in a way that is natural for
|
5
|
+
# ETL style applications. In a Remi job, the user defines all of
|
6
|
+
# the sources, transforms, and targets necessary to transform data.
|
7
|
+
# Any number of sources, transforms, and targets can be defined.
|
8
|
+
# Transforms can call other parameterized sub-transforms. Jobs can
|
9
|
+
# collect data from other parameterized sub-jobs, pass data to other
|
10
|
+
# sub-jobs, or both pass and collect data from other sub-jobs.
|
11
|
+
#
|
12
|
+
# Jobs are executed by calling the `#execute` method in an instance
|
13
|
+
# of the job. This triggers all transforms to be executed in the
|
14
|
+
# order they are defined. Sub-transforms are only executed if they
|
15
|
+
# are referenced in a transform. After all transforms have
|
16
|
+
# executed, the targets are loaded in the order they are defined.
|
17
|
+
#
|
18
|
+
#
|
19
|
+
#
|
20
|
+
# @example
|
21
|
+
#
|
22
|
+
# class MyJob < Remi::Job
|
23
|
+
# source :my_csv_file do
|
24
|
+
# extractor my_extractor
|
25
|
+
# parser my_parser
|
26
|
+
# enforce_types
|
27
|
+
# end
|
28
|
+
#
|
29
|
+
# target :my_transformed_file do
|
30
|
+
# loader my_loader
|
31
|
+
# end
|
32
|
+
#
|
33
|
+
# transform :transform_data do
|
34
|
+
# # Data sources are converted into a dataframe the first time the #df method is called.
|
35
|
+
# transform_work = my_csv_file.df.dup # => a copy of the my_csv_file.df dataframe
|
36
|
+
#
|
37
|
+
# # Any arbitrary Ruby is allowed in a transform block. Remi provides a convenient
|
38
|
+
# # source to target map DSL to map fields from sources to targets
|
39
|
+
# Remi::SourceToTargetMap.apply(transform_work, my_transformed_file.df) do
|
40
|
+
# map source(:source_field_id) .target(:prefixed_id)
|
41
|
+
# .transform(->(v) { "PREFIX#{v}" })
|
42
|
+
# end
|
43
|
+
# end
|
44
|
+
# end
|
45
|
+
#
|
46
|
+
# # The job is executed when `#execute` is called on an instance of the job.
|
47
|
+
# # Transforms are executed in the order they are defined. Targets are loaded
|
48
|
+
# # in the order they are defined after all transforms have been executed.
|
49
|
+
# job = MyJob.new
|
50
|
+
# job.execute
|
51
|
+
#
|
52
|
+
#
|
53
|
+
#
|
54
|
+
# @todo MOAR Examples! Subtransforms, subjobs, parameters, references to even more
|
55
|
+
# complete sample jobs.
|
56
|
+
class Job
|
57
|
+
class << self
|
58
|
+
|
59
|
+
def inherited(base)
|
60
|
+
base.instance_variable_set(:@params, params.clone)
|
61
|
+
base.instance_variable_set(:@sources, sources.dup)
|
62
|
+
base.instance_variable_set(:@targets, targets.dup)
|
63
|
+
base.instance_variable_set(:@transforms, transforms.dup)
|
64
|
+
base.instance_variable_set(:@sub_jobs, sub_jobs.dup)
|
12
65
|
end
|
13
66
|
|
14
|
-
|
67
|
+
# @return [Job::Parameters] all parameters defined at the class level
|
68
|
+
def params
|
69
|
+
@params ||= Parameters.new
|
70
|
+
end
|
71
|
+
|
72
|
+
# Defines a job parameter.
|
73
|
+
# @example
|
74
|
+
#
|
75
|
+
# class MyJob < Job
|
76
|
+
# param(:my_param) { 'the best parameter' }
|
77
|
+
# end
|
78
|
+
#
|
79
|
+
# job = MyJob.new
|
80
|
+
# job.params[:my_param] #=> 'the best parameter'
|
81
|
+
def param(name, &block)
|
82
|
+
params.__define__(name, &block)
|
83
|
+
end
|
84
|
+
|
85
|
+
# @return [Array<Symbol>] the list of data source names
|
86
|
+
def sources
|
15
87
|
@sources ||= []
|
16
|
-
|
88
|
+
end
|
17
89
|
|
18
|
-
define_method(name) do
|
19
|
-
iv_name = instance_variable_get("@#{name}")
|
20
|
-
return iv_name if iv_name
|
21
90
|
|
22
|
-
|
91
|
+
# @return [Array<Symbol>] the list of sub-jobs
|
92
|
+
def sub_jobs
|
93
|
+
@sub_jobs ||= []
|
94
|
+
end
|
95
|
+
|
96
|
+
# Defines a sub job resource for this job.
|
97
|
+
# Note that the return value of the DSL block must be an instance of a Remi::Job
|
98
|
+
# @example
|
99
|
+
#
|
100
|
+
# class MyJob < Job
|
101
|
+
# sub_job(:my_sub_job) { MySubJob.new }
|
102
|
+
# end
|
103
|
+
#
|
104
|
+
# job = MyJob.new
|
105
|
+
# job.my_sub_job.job #=> An instance of MySubJob
|
106
|
+
def sub_job(name, &block)
|
107
|
+
sub_jobs << name unless sub_jobs.include? name
|
108
|
+
attr_accessor name
|
109
|
+
|
110
|
+
define_method("__init_#{name}__".to_sym) do
|
111
|
+
sub_job = Job::SubJob.new(self, name: name, &block)
|
112
|
+
instance_variable_set("@#{name}", sub_job)
|
113
|
+
end
|
114
|
+
end
|
115
|
+
|
116
|
+
# Defines a data source.
|
117
|
+
# @example
|
118
|
+
#
|
119
|
+
# class MyJob < Job
|
120
|
+
# source :my_source do
|
121
|
+
# extractor my_extractor
|
122
|
+
# parser my_parser
|
123
|
+
# end
|
124
|
+
# end
|
125
|
+
#
|
126
|
+
# job = MyJob.new
|
127
|
+
# job.my_source.df #=> a dataframe generated after extracting and parsing
|
128
|
+
def source(name, &block)
|
129
|
+
sources << name unless sources.include? name
|
130
|
+
attr_accessor name
|
131
|
+
|
132
|
+
define_method("__init_#{name}__".to_sym) do
|
133
|
+
source = DataSource.new(self, name: name, &block)
|
23
134
|
instance_variable_set("@#{name}", source)
|
24
135
|
end
|
25
136
|
end
|
26
137
|
|
27
|
-
|
138
|
+
# @return [Array<Symbol>] the list of data target names
|
139
|
+
def targets
|
28
140
|
@targets ||= []
|
29
|
-
|
30
|
-
|
31
|
-
define_method(name) do
|
32
|
-
iv_name = instance_variable_get("@#{name}")
|
33
|
-
return iv_name if iv_name
|
141
|
+
end
|
34
142
|
|
35
|
-
|
143
|
+
# Defines a data target.
|
144
|
+
# @example
|
145
|
+
#
|
146
|
+
# class MyJob < Job
|
147
|
+
# target :my_target do
|
148
|
+
# encoder my_encoder
|
149
|
+
# loader my_loader
|
150
|
+
# end
|
151
|
+
# end
|
152
|
+
#
|
153
|
+
# job = MyJob.new
|
154
|
+
# job.my_target.df #=> the dataframe that is sent to the target when it is loaded
|
155
|
+
def target(name, &block)
|
156
|
+
targets << name unless targets.include? name
|
157
|
+
attr_accessor name
|
158
|
+
|
159
|
+
define_method("__init_#{name}__".to_sym) do
|
160
|
+
target = DataTarget.new(self, name: name, &block)
|
36
161
|
instance_variable_set("@#{name}", target)
|
37
162
|
end
|
38
163
|
end
|
39
164
|
|
40
|
-
|
41
|
-
|
42
|
-
@transforms[
|
165
|
+
# @return [Array<Symbol>] the list of transform names
|
166
|
+
def transforms
|
167
|
+
@transforms ||= []
|
168
|
+
end
|
43
169
|
|
44
|
-
|
45
|
-
|
46
|
-
|
170
|
+
# Defines a transform.
|
171
|
+
# @example
|
172
|
+
#
|
173
|
+
# class MyJob < Job
|
174
|
+
# transform :my_transform do
|
175
|
+
# puts "hello from my_transform!"
|
176
|
+
# end
|
177
|
+
# end
|
178
|
+
#
|
179
|
+
# job = MyJob.new
|
180
|
+
# job.my_transform.execute #=>(stdout) 'hello from my_transform!'
|
181
|
+
def transform(name, &block)
|
182
|
+
transforms << name unless transforms.include? name
|
183
|
+
attr_accessor name
|
184
|
+
|
185
|
+
define_method("__init_#{name}__".to_sym) do
|
186
|
+
transform = Transform.new(self, name: name, &block)
|
187
|
+
instance_variable_set("@#{name}", transform)
|
47
188
|
end
|
48
189
|
end
|
49
190
|
|
50
|
-
|
51
|
-
|
191
|
+
# Defines a sub-transform.
|
192
|
+
# @example
|
193
|
+
#
|
194
|
+
# class MyJob < Job
|
195
|
+
# sub_transform :my_sub_transform, greeting: 'hello' do
|
196
|
+
# puts "#{params[:greeting]} from my_sub_transform!"
|
197
|
+
# end
|
198
|
+
#
|
199
|
+
# transform :my_transform do
|
200
|
+
# import :my_sub_transform, greeting: 'bonjour' do
|
201
|
+
# end
|
202
|
+
# end
|
203
|
+
# end
|
204
|
+
#
|
205
|
+
# job = MyJob.new
|
206
|
+
# job.my_transform.execute #=>(stdout) 'bonjour from my_sub_transform!'
|
207
|
+
def sub_transform(name, **kargs, &block)
|
208
|
+
define_method(name) do
|
209
|
+
Transform.new(self, name: name, **kargs, &block)
|
210
|
+
end
|
52
211
|
end
|
212
|
+
end
|
53
213
|
|
54
|
-
|
55
|
-
|
56
|
-
|
214
|
+
# Initializes the job
|
215
|
+
#
|
216
|
+
# @param work_dir [String, Path] sets the working directory for this job
|
217
|
+
# @param logger [Object] sets the logger for the job
|
218
|
+
# @param kargs [Hash] Optional job parameters (can be referenced in the job via `#params`)
|
219
|
+
def initialize(work_dir: Settings.work_dir, logger: Settings.logger, **kargs)
|
220
|
+
@work_dir = work_dir
|
221
|
+
@logger = logger
|
222
|
+
create_work_dir
|
57
223
|
|
58
|
-
|
59
|
-
|
60
|
-
|
224
|
+
__init_params__ **kargs
|
225
|
+
__init_sub_jobs__
|
226
|
+
__init_sources__
|
227
|
+
__init_targets__
|
228
|
+
__init_transforms__
|
229
|
+
end
|
61
230
|
|
62
|
-
|
63
|
-
|
64
|
-
end
|
231
|
+
# @return [String] the working directory used for temporary data
|
232
|
+
attr_reader :work_dir
|
65
233
|
|
234
|
+
# @return [Object] the logging object
|
235
|
+
attr_reader :logger
|
66
236
|
|
67
|
-
|
68
|
-
|
69
|
-
end
|
237
|
+
# @return [Job::Parameters] parameters defined at the class level or during instantiation
|
238
|
+
attr_reader :params
|
70
239
|
|
71
|
-
|
72
|
-
|
240
|
+
# @return [Array] list of sub_jobs defined in the job
|
241
|
+
attr_reader :sub_jobs
|
73
242
|
|
74
|
-
|
75
|
-
|
76
|
-
receiver.params = self.params.merge(receiver.params)
|
77
|
-
receiver.sources = self.sources + receiver.sources
|
78
|
-
receiver.targets = self.targets + receiver.targets
|
79
|
-
receiver.transforms = self.transforms.merge(receiver.transforms)
|
80
|
-
end
|
81
|
-
end
|
243
|
+
# @return [Array] list of sources defined in the job
|
244
|
+
attr_reader :sources
|
82
245
|
|
83
|
-
|
84
|
-
|
85
|
-
end
|
246
|
+
# @return [Array] list of targets defined in the job
|
247
|
+
attr_reader :targets
|
86
248
|
|
249
|
+
# @return [Array] list of transforms defined in the job
|
250
|
+
attr_reader :transforms
|
87
251
|
|
88
|
-
def params
|
89
|
-
self.class.params
|
90
|
-
end
|
91
252
|
|
92
|
-
|
93
|
-
|
253
|
+
# Creates a temporary working directory for the job
|
254
|
+
def create_work_dir
|
255
|
+
@logger.info "Creating working directory #{work_dir}"
|
256
|
+
FileUtils.mkdir_p work_dir
|
94
257
|
end
|
95
258
|
|
96
|
-
|
97
|
-
|
259
|
+
|
260
|
+
# @return [self] the job object (needed to reference parent job in transform DSL)
|
261
|
+
def job
|
262
|
+
self
|
98
263
|
end
|
99
264
|
|
100
|
-
def
|
101
|
-
|
265
|
+
def to_s
|
266
|
+
inspect
|
102
267
|
end
|
103
268
|
|
269
|
+
def inspect
|
270
|
+
"#<#{Remi::Job}>: #{self.class}\n" +
|
271
|
+
" parameters: #{params.to_h.keys}\n" +
|
272
|
+
" sources: #{sources}\n" +
|
273
|
+
" targets: #{targets}\n" +
|
274
|
+
" transforms: #{transforms}"
|
275
|
+
end
|
104
276
|
|
105
277
|
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
278
|
+
# Execute the specified components of the job.
|
279
|
+
#
|
280
|
+
# @param components [Array<Symbol>] list of components to execute (e.g., `:transforms`, `:load_targets`)
|
281
|
+
#
|
282
|
+
# @return [self]
|
283
|
+
def execute(*components)
|
284
|
+
execute_transforms if components.empty? || components.include?(:transforms)
|
285
|
+
execute_load_targets if components.empty? || components.include?(:load_targets)
|
286
|
+
self
|
111
287
|
end
|
112
288
|
|
113
|
-
|
289
|
+
private
|
114
290
|
|
115
|
-
def
|
116
|
-
self.class.
|
291
|
+
def __init_params__(**kargs)
|
292
|
+
@params = self.class.params.clone
|
293
|
+
add_params **kargs
|
294
|
+
params.context = self
|
117
295
|
end
|
118
296
|
|
119
|
-
def
|
120
|
-
|
297
|
+
def __init_sub_jobs__
|
298
|
+
@sub_jobs = self.class.sub_jobs
|
299
|
+
@sub_jobs.each do |sub_job|
|
300
|
+
send("__init_#{sub_job}__".to_sym)
|
301
|
+
end
|
121
302
|
end
|
122
303
|
|
123
|
-
def
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
else
|
128
|
-
@logger.debug "Not going to delete working directory #{work_dir}"
|
129
|
-
nil
|
304
|
+
def __init_sources__
|
305
|
+
@sources = self.class.sources
|
306
|
+
@sources.each do |source|
|
307
|
+
send("__init_#{source}__".to_sym)
|
130
308
|
end
|
131
309
|
end
|
132
310
|
|
133
|
-
def
|
134
|
-
@
|
135
|
-
|
311
|
+
def __init_targets__
|
312
|
+
@targets = self.class.targets
|
313
|
+
@targets.each do |target|
|
314
|
+
send("__init_#{target}__".to_sym)
|
315
|
+
end
|
136
316
|
end
|
137
317
|
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
#
|
143
|
-
# sources - Array of source names
|
144
|
-
# targets - Array of target names
|
145
|
-
#
|
146
|
-
# Returns an array containing the result of each transform.
|
147
|
-
def run_transforms_using(sources: nil, targets: nil)
|
148
|
-
transforms.map do |t, st|
|
149
|
-
selected_sources = (st[:sources] & Array(sources || st[:sources])).size > 0
|
150
|
-
selected_targets = (st[:targets] & Array(targets || st[:targets])).size > 0
|
151
|
-
self.send(t) if selected_sources && selected_targets
|
318
|
+
def __init_transforms__
|
319
|
+
@transforms = self.class.transforms
|
320
|
+
@transforms.each do |transform|
|
321
|
+
send("__init_#{transform}__".to_sym)
|
152
322
|
end
|
153
323
|
end
|
154
324
|
|
155
|
-
|
156
|
-
|
325
|
+
# Executes all transforms defined
|
326
|
+
def execute_transforms
|
327
|
+
transforms.map { |t| send(t).execute }
|
328
|
+
self
|
157
329
|
end
|
158
330
|
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
end
|
331
|
+
# Loads all targets defined
|
332
|
+
def execute_load_targets
|
333
|
+
targets.each { |t| send(t).load }
|
334
|
+
self
|
164
335
|
end
|
165
336
|
|
166
|
-
#
|
167
|
-
|
168
|
-
|
169
|
-
def run
|
170
|
-
# Do all of the stuff here
|
171
|
-
run_all_transforms
|
172
|
-
load_all_targets
|
173
|
-
self
|
337
|
+
# Adds all parameters listed to the job parameters
|
338
|
+
def add_params(**kargs)
|
339
|
+
kargs.each { |k,v| params[k] = v }
|
174
340
|
end
|
175
341
|
end
|
176
342
|
end
|