remi 0.2.42 → 0.3.0

Files changed (94)
  1. checksums.yaml +4 -4
  2. data/.yardopts +7 -0
  3. data/Gemfile +1 -1
  4. data/Gemfile.lock +13 -26
  5. data/README.md +1 -1
  6. data/features/step_definitions/remi_step.rb +33 -13
  7. data/features/sub_job_example.feature +24 -0
  8. data/features/sub_transform_example.feature +35 -0
  9. data/features/sub_transform_many_to_many.feature +49 -0
  10. data/features/support/env_app.rb +1 -1
  11. data/jobs/all_jobs_shared.rb +19 -16
  12. data/jobs/copy_source_job.rb +11 -9
  13. data/jobs/csv_file_target_job.rb +10 -9
  14. data/jobs/json_job.rb +18 -14
  15. data/jobs/metadata_job.rb +33 -28
  16. data/jobs/parameters_job.rb +14 -11
  17. data/jobs/sample_job.rb +106 -77
  18. data/jobs/sftp_file_target_job.rb +14 -13
  19. data/jobs/sub_job_example_job.rb +86 -0
  20. data/jobs/sub_transform_example_job.rb +43 -0
  21. data/jobs/sub_transform_many_to_many_job.rb +46 -0
  22. data/jobs/transforms/concatenate_job.rb +16 -12
  23. data/jobs/transforms/data_frame_sieve_job.rb +24 -19
  24. data/jobs/transforms/date_diff_job.rb +15 -11
  25. data/jobs/transforms/nvl_job.rb +16 -12
  26. data/jobs/transforms/parse_date_job.rb +17 -14
  27. data/jobs/transforms/partitioner_job.rb +27 -19
  28. data/jobs/transforms/prefix_job.rb +13 -10
  29. data/jobs/transforms/truncate_job.rb +14 -10
  30. data/jobs/transforms/truthy_job.rb +11 -8
  31. data/lib/remi.rb +25 -11
  32. data/lib/remi/data_frame.rb +4 -4
  33. data/lib/remi/data_frame/daru.rb +1 -37
  34. data/lib/remi/data_subject.rb +234 -48
  35. data/lib/remi/data_subjects/csv_file.rb +171 -0
  36. data/lib/remi/data_subjects/data_frame.rb +106 -0
  37. data/lib/remi/data_subjects/file_system.rb +115 -0
  38. data/lib/remi/data_subjects/local_file.rb +109 -0
  39. data/lib/remi/data_subjects/none.rb +31 -0
  40. data/lib/remi/data_subjects/postgres.rb +186 -0
  41. data/lib/remi/data_subjects/s3_file.rb +84 -0
  42. data/lib/remi/data_subjects/salesforce.rb +211 -0
  43. data/lib/remi/data_subjects/sftp_file.rb +196 -0
  44. data/lib/remi/data_subjects/sub_job.rb +50 -0
  45. data/lib/remi/dsl.rb +74 -0
  46. data/lib/remi/encoder.rb +45 -0
  47. data/lib/remi/extractor.rb +21 -0
  48. data/lib/remi/field_symbolizers.rb +1 -0
  49. data/lib/remi/job.rb +279 -113
  50. data/lib/remi/job/parameters.rb +90 -0
  51. data/lib/remi/job/sub_job.rb +35 -0
  52. data/lib/remi/job/transform.rb +165 -0
  53. data/lib/remi/loader.rb +22 -0
  54. data/lib/remi/monkeys/daru.rb +4 -0
  55. data/lib/remi/parser.rb +44 -0
  56. data/lib/remi/testing/business_rules.rb +17 -23
  57. data/lib/remi/testing/data_stub.rb +2 -2
  58. data/lib/remi/version.rb +1 -1
  59. data/remi.gemspec +3 -0
  60. data/spec/data_subject_spec.rb +475 -11
  61. data/spec/data_subjects/csv_file_spec.rb +69 -0
  62. data/spec/data_subjects/data_frame_spec.rb +52 -0
  63. data/spec/{extractor → data_subjects}/file_system_spec.rb +0 -0
  64. data/spec/{extractor → data_subjects}/local_file_spec.rb +0 -0
  65. data/spec/data_subjects/none_spec.rb +41 -0
  66. data/spec/data_subjects/postgres_spec.rb +80 -0
  67. data/spec/{extractor → data_subjects}/s3_file_spec.rb +0 -0
  68. data/spec/data_subjects/salesforce_spec.rb +117 -0
  69. data/spec/{extractor → data_subjects}/sftp_file_spec.rb +16 -0
  70. data/spec/data_subjects/sub_job_spec.rb +33 -0
  71. data/spec/encoder_spec.rb +38 -0
  72. data/spec/extractor_spec.rb +11 -0
  73. data/spec/fixtures/sf_bulk_helper_stubs.rb +443 -0
  74. data/spec/job/transform_spec.rb +257 -0
  75. data/spec/job_spec.rb +507 -0
  76. data/spec/loader_spec.rb +11 -0
  77. data/spec/parser_spec.rb +38 -0
  78. data/spec/sf_bulk_helper_spec.rb +117 -0
  79. data/spec/testing/data_stub_spec.rb +5 -3
  80. metadata +109 -27
  81. data/features/aggregate.feature +0 -42
  82. data/jobs/aggregate_job.rb +0 -31
  83. data/jobs/transforms/transform_jobs.rb +0 -4
  84. data/lib/remi/data_subject/csv_file.rb +0 -162
  85. data/lib/remi/data_subject/data_frame.rb +0 -52
  86. data/lib/remi/data_subject/postgres.rb +0 -134
  87. data/lib/remi/data_subject/salesforce.rb +0 -136
  88. data/lib/remi/data_subject/sftp_file.rb +0 -65
  89. data/lib/remi/extractor/file_system.rb +0 -92
  90. data/lib/remi/extractor/local_file.rb +0 -43
  91. data/lib/remi/extractor/s3_file.rb +0 -57
  92. data/lib/remi/extractor/sftp_file.rb +0 -83
  93. data/spec/data_subject/csv_file_spec.rb +0 -79
  94. data/spec/data_subject/data_frame.rb +0 -27
data/lib/remi/data_subjects/sftp_file.rb
@@ -0,0 +1,196 @@
+module Remi
+
+  # Sftp File extractor
+  # Used to extract files from an SFTP server
+  #
+  # @example
+  #
+  #   class MyJob < Remi::Job
+  #     source :some_file do
+  #       extractor Remi::Extractor::SftpFile.new(
+  #         credentials: {
+  #           host: 'coolserver.com',
+  #           username: 'myself',
+  #           password: 'secret'
+  #         },
+  #         remote_path: '/',
+  #         pattern: /^some_file_\d{14}\.csv/,
+  #         most_recent_only: true
+  #       )
+  #
+  #       parser Remi::Parser::CsvFile.new(
+  #         csv_options: {
+  #           headers: true,
+  #           col_sep: ','
+  #         }
+  #       )
+  #     end
+  #   end
+  #
+  #   job = MyJob.new
+  #   job.some_file.df
+  #   # => #<Daru::DataFrame:70153153438500 @name = 4c59cfdd-7de7-4264-8666-83153f46a9e4 @size = 3>
+  #   #      id    name
+  #   #   0   1  Albert
+  #   #   1   2   Betsy
+  #   #   2   3    Camu
+  class Extractor::SftpFile < Extractor::FileSystem
+    N_RETRY = 3
+
+    # @param credentials [Hash] Options hash containing login credentials
+    # @param credentials [String] :host SFTP host (e.g., coolserver.com)
+    # @param credentials [String] :username SFTP username
+    # @param credentials [String] :password SFTP password
+    # @param credentials [String] :port SFTP port (default: 22)
+    def initialize(*args, **kargs, &block)
+      super
+      init_sftp_extractor(*args, **kargs)
+    end
+
+    attr_reader :host
+    attr_reader :username
+    attr_reader :password
+    attr_reader :port
+
+    # Called to extract files from the source filesystem.
+    # @return [Array<String>] An array of paths to a local copy of the files extracted
+    def extract
+      connection do |sftp|
+        entries.map do |entry|
+          local_file = File.join(@local_path, entry.name)
+          logger.info "Downloading #{entry.name} to #{local_file}"
+          retry_download { sftp.download!(File.join(@remote_path, entry.name), local_file) }
+          local_file
+        end
+      end
+    end
+
+    # @return [Array<Extractor::FileSystemEntry>] (Memoized) list of objects in the bucket/prefix
+    def all_entries
+      @all_entries ||= all_entries!
+    end
+
+    # @return [Array<Extractor::FileSystemEntry>] (Memoized) list of objects in the bucket/prefix
+    def all_entries!
+      sftp_entries = connection { |sftp| sftp.dir.entries(@remote_path) }
+      sftp_entries.map do |entry|
+        # Early versions of the protocol don't support create time, fake it with modified time?
+        FileSystemEntry.new(
+          pathname: File.join(@remote_path, entry.name),
+          create_time: entry.attributes.respond_to?(:createtime) ? entry.attributes.createtime : entry.attributes.mtime,
+          modified_time: entry.attributes.mtime
+        )
+      end
+    end
+
+
+    private
+
+    def init_sftp_extractor(*args, credentials:, **kargs)
+      @host = credentials.fetch(:host)
+      @username = credentials.fetch(:username)
+      @password = credentials.fetch(:password)
+      @port = credentials.fetch(:port, '22')
+    end
+
+    def connection(&block)
+      result = nil
+      Net::SFTP.start(@host, @username, password: @password, port: @port) do |sftp|
+        result = yield sftp
+      end
+      result
+    end
+
+    def retry_download(&block)
+      1.upto(N_RETRY).each do |itry|
+        begin
+          block.call
+          break
+        rescue RuntimeError => err
+          raise err unless itry < N_RETRY
+          logger.error "Download failed with error: #{err.message}"
+          logger.error "Retry attempt #{itry}/#{N_RETRY-1}"
+          sleep(1)
+        end
+      end
+    end
+  end
+
+
+
+  # SFTP file loader
+  #
+  # @example
+  #   class MyJob < Remi::Job
+  #     target :my_target do
+  #       encoder Remi::Encoder::CsvFile.new(
+  #         csv_options: { col_sep: '|' }
+  #       )
+  #       loader Remi::Loader::SftpFile.new(
+  #         credentials: { },
+  #         remote_path: 'some_test.csv'
+  #       )
+  #       loader Remi::Loader::SftpFile.new(
+  #         credentials: { },
+  #         remote_path: 'some_other_test.csv'
+  #       )
+  #     end
+  #   end
+  #
+  #   my_df = Daru::DataFrame.new({ a: 1.upto(5).to_a, b: 6.upto(10) })
+  #   job = MyJob.new
+  #   job.my_target.df = my_df
+  #   job.my_target.load
+  class Loader::SftpFile < Loader
+
+    # @param remote_path [String, Pathname] Full path to the file to be created on the target filesystem
+    def initialize(*args, **kargs, &block)
+      super
+      init_sftp_loader(*args, **kargs, &block)
+    end
+
+    attr_reader :remote_path
+
+    # Copies data to the SFTP Server
+    # @param data [Object] The path to the file in the temporary work location
+    # @return [true] On success
+    def load(data)
+      logger.info "Uploading #{data} to #{@credentials[:username]}@#{@credentials[:host]}: #{@remote_path}"
+      connection do |sftp|
+        retry_upload { sftp.upload! data, @remote_path }
+      end
+
+      true
+    end
+
+
+    private
+
+    def init_sftp_loader(*args, credentials:, remote_path:, **kargs, &block)
+      @credentials = credentials
+      @remote_path = remote_path
+    end
+
+    def connection(&block)
+      result = nil
+      Net::SFTP.start(@credentials[:host], @credentials[:username], password: @credentials[:password], port: @credentials[:port] || '22') do |sftp|
+        result = yield sftp
+      end
+      result
+    end
+
+    def retry_upload(ntry=2, &block)
+      1.upto(ntry).each do |itry|
+        begin
+          block.call
+          break
+        rescue RuntimeError => err
+          raise err unless itry < ntry
+          logger.error "Upload failed with error: #{err.message}"
+          logger.error "Retry attempt #{itry}/#{ntry-1}"
+          sleep(1)
+        end
+      end
+    end
+  end
+end
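The two halves above compose into a single job. A hedged end-to-end sketch, built from the @example blocks in this file (hostnames, credentials, and file names are placeholders): pull the newest matching file from an SFTP path, parse it, and push it back out pipe-delimited.

class MirrorJob < Remi::Job
  source :inbound do
    extractor Remi::Extractor::SftpFile.new(
      credentials: { host: 'coolserver.com', username: 'myself', password: 'secret' },
      remote_path: '/',
      pattern: /^some_file_\d{14}\.csv/,
      most_recent_only: true
    )
    parser Remi::Parser::CsvFile.new(csv_options: { headers: true, col_sep: ',' })
  end

  target :outbound do
    encoder Remi::Encoder::CsvFile.new(csv_options: { col_sep: '|' })
    loader Remi::Loader::SftpFile.new(
      credentials: { host: 'coolserver.com', username: 'myself', password: 'secret' },
      remote_path: 'mirror_of_some_file.csv'
    )
  end

  transform :copy do
    outbound.df = inbound.df  # pass the parsed dataframe straight through
  end
end

MirrorJob.new.execute  # download (with retries), parse, re-encode, upload (with retries)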
data/lib/remi/data_subjects/sub_job.rb
@@ -0,0 +1,50 @@
+module Remi
+  class Extractor::SubJob < Extractor
+
+    # @param sub_job [Object] The name (relative to parent job) of the sub job to use
+    # @param data_subject [Symbol] The name (relative to the sub job) of the sub job's data frame
+    def initialize(*args, **kargs, &block)
+      super
+      init_sub_job_extractor(*args, **kargs, &block)
+    end
+
+    attr_accessor :sub_job, :data_subject
+
+    def extract
+      sub_job.job.send(data_subject).df
+    end
+
+    private
+
+    def init_sub_job_extractor(*args, sub_job:, data_subject:, **kargs, &block)
+      @sub_job = sub_job
+      @data_subject = data_subject
+    end
+
+  end
+
+  class Loader::SubJob < Loader
+    # @param sub_job [Object] The name (relative to parent job) of the sub job to use
+    # @param data_subject [Symbol] The name (relative to the sub job) of the sub job's data frame
+    def initialize(*args, **kargs, &block)
+      super
+      init_sub_job_loader(*args, **kargs, &block)
+    end
+
+    attr_accessor :sub_job, :data_subject
+
+    # @param data_frame [Object] Data frame to load to target sub job data subject
+    # @return [true] On success
+    def load(data_frame)
+      sub_job.job.send(data_subject).df = data_frame
+      true
+    end
+
+    private
+
+    def init_sub_job_loader(*args, sub_job:, data_subject:, **kargs, &block)
+      @sub_job = sub_job
+      @data_subject = data_subject
+    end
+  end
+end
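A minimal sketch of the contract these two classes implement, inferred from the code above (it is assumed that `my_sub_job` is a Remi::Job::SubJob as built by the `sub_job` DSL in lib/remi/job.rb, and that `:staging` is one of the wrapped job's data subjects):

extractor = Remi::Extractor::SubJob.new(sub_job: my_sub_job, data_subject: :staging)
df = extractor.extract              # => the sub job's :staging dataframe

loader = Remi::Loader::SubJob.new(sub_job: my_sub_job, data_subject: :staging)
loader.load(df)                     # assigns the dataframe back onto the sub job's :staging subject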
data/lib/remi/dsl.rb
@@ -0,0 +1,74 @@
+module Remi
+
+  # @api private
+  #
+  # A namespace for functions relating to the execution of a block against a
+  # proxy object.
+  #
+  # Much of this code was borrowed from [Docile](https://github.com/ms-ati/docile)
+  # and was modified to support different fallback contexts.
+  # @see Docile [Docile](https://github.com/ms-ati/docile)
+
+  module Dsl
+    # Execute a block in the context of an object whose methods represent the
+    # commands in a DSL, using a specific proxy class.
+    #
+    # @param dsl [Object] context object whose methods make up the
+    #   (initial) DSL
+    # @param fallback_dsl [Object] context object that the DSL should fall back
+    #   to if the primary context fails to resolve
+    # @param proxy_type [FallbackContextProxy, ChainingFallbackContextProxy]
+    #   which class to instantiate as proxy context
+    # @param args [Array] arguments to be passed to the block
+    # @param block [Proc] the block of DSL commands to be executed
+    # @return [Object] the return value of the block
+
+    def exec_in_proxy_context(dsl, fallback_dsl, proxy_type, *args, &block)
+      block_context = fallback_dsl
+      proxy_context = proxy_type.new(dsl, block_context)
+      begin
+        block_context.instance_variables.each do |ivar|
+          value_from_block = block_context.instance_variable_get(ivar)
+          proxy_context.instance_variable_set(ivar, value_from_block)
+        end
+        proxy_context.instance_exec(*args, &block)
+      ensure
+        block_context.instance_variables.each do |ivar|
+          value_from_dsl_proxy = proxy_context.instance_variable_get(ivar)
+          block_context.instance_variable_set(ivar, value_from_dsl_proxy)
+        end
+      end
+    end
+    module_function :exec_in_proxy_context
+
+
+    # Execute a block in the context of an object whose methods represent the
+    # commands in a DSL.
+    #
+    # @note Use with an *imperative* DSL (commands modify the context object)
+    #
+    # Use this method to execute an *imperative* DSL, which means that:
+    #
+    # 1. Each command mutates the state of the DSL context object
+    # 2. The return value of each command is ignored
+    # 3. The final return value is the original context object
+    #
+    #
+    # @param dsl [Object] context object whose methods make up the DSL
+    # @param fallback_dsl [Object] context object that the DSL should fall back to
+    # @param args [Array] arguments to be passed to the block
+    # @param block [Proc] the block of DSL commands to be executed against the
+    #   `dsl` context object
+    # @return [Object] the `dsl` context object after executing the block
+    def dsl_eval(dsl, fallback_dsl, *args, &block)
+      exec_in_proxy_context(dsl, fallback_dsl, Docile::FallbackContextProxy, *args, &block)
+      dsl
+    end
+    module_function :dsl_eval
+
+    def dsl_return(dsl, fallback_dsl, *args, &block)
+      exec_in_proxy_context(dsl, fallback_dsl, Docile::FallbackContextProxy, *args, &block)
+    end
+    module_function :dsl_return
+  end
+end
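A small sketch of the fallback behaviour `dsl_eval` provides (the Recipe/Kitchen classes are illustrative, not part of the gem): methods in the block resolve against the DSL object first and fall back to the calling context, and the DSL object itself is returned.

class Recipe
  attr_reader :steps
  def initialize; @steps = []; end
  def step(name); @steps << name; end
end

class Kitchen
  def favorite_step; :saute; end

  def build
    # `step` is found on the Recipe; `favorite_step` is not, so it falls back to this Kitchen.
    Remi::Dsl.dsl_eval(Recipe.new, self) do
      step :chop
      step favorite_step
    end
  end
end

Kitchen.new.build.steps  # => [:chop, :saute]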
data/lib/remi/encoder.rb
@@ -0,0 +1,45 @@
+module Remi
+  # An encoder is an object that converts a dataframe into a form that can
+  # be used by a Remi::Loader. This is a parent class meant to be
+  # inherited by child classes that define specific ways to encode
+  # data.
+  class Encoder
+
+    # @param context [Object] The context (e.g., DataTarget) for the encoder (default: `nil`)
+    # @param field_symbolizer [Proc] The field symbolizer to use for this encoder
+    # @param fields [Remi::Fields] A hash of field metadata to be used by the encoder
+    def initialize(*args, context: nil, field_symbolizer: Remi::FieldSymbolizers[:standard], fields: Remi::Fields.new({}), logger: Remi::Settings.logger, **kargs, &block)
+      @context = context
+      @field_symbolizer = field_symbolizer
+
+      @fields = fields
+      @logger = logger
+    end
+
+    attr_accessor :context
+    attr_accessor :logger
+    attr_writer :field_symbolizer
+    attr_writer :fields
+
+    # Any child classes need to define an encode method that converts the
+    # data subject's dataframe into a structure that can be loaded into the
+    # target system.
+    # @param dataframe [Remi::DataFrame] The dataframe to be encoded
+    # @return [Object] The encoded data to be loaded into the target
+    def encode(dataframe)
+      raise NoMethodError, "#{__method__} not defined for #{self.class.name}"
+    end
+
+    # @return [Proc] The field symbolizer (uses the context field symbolizer if defined)
+    def field_symbolizer
+      return context.field_symbolizer if context && context.respond_to?(:field_symbolizer)
+      @field_symbolizer
+    end
+
+    # @return [Remi::Fields] The fields (uses the context fields if defined)
+    def fields
+      return context.fields if context && context.respond_to?(:fields)
+      @fields
+    end
+  end
+end
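A hypothetical subclass (illustrative only, not shipped with the gem) showing the contract: `#encode` receives the dataframe and returns whatever the paired loader expects, while `field_symbolizer` and `fields` defer to the attached context when one responds to them.

class PassthroughEncoder < Remi::Encoder
  # Hand the dataframe to the loader unchanged.
  def encode(dataframe)
    dataframe
  end
end

encoder = PassthroughEncoder.new(fields: Remi::Fields.new({}))
encoder.encode(some_dataframe)  # => some_dataframe (a placeholder), ready for a loader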
data/lib/remi/extractor.rb
@@ -0,0 +1,21 @@
+module Remi
+  # An extractor is an object meant to extract data from some external system.
+  # This is a parent class meant to be inherited by child classes that
+  # define specific ways to extract data.
+  class Extractor
+
+    def initialize(*args, logger: Remi::Settings.logger, **kargs, &block)
+      @logger = logger
+    end
+
+    # @return [Object] The logger object used by the extractor
+    attr_accessor :logger
+
+    # Any child classes need to define an extract method that returns data
+    # in a format that an appropriate parser can use to convert into a dataframe
+    def extract
+      raise NoMethodError, "#{__method__} not defined for #{self.class.name}"
+    end
+
+  end
+end
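The same pattern applies here. A hypothetical subclass (illustrative only) that satisfies the `#extract` contract by returning canned data for a parser to consume:

class StaticExtractor < Remi::Extractor
  def initialize(*args, data:, **kargs, &block)
    super
    @data = data
  end

  # Return raw data in whatever form the paired parser expects.
  def extract
    @data
  end
end

StaticExtractor.new(data: "id,name\n1,Albert").extract  # => "id,name\n1,Albert"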
data/lib/remi/field_symbolizers.rb
@@ -12,6 +12,7 @@ module Remi
           gsub(/[^0-9a-zA-Z_.]+/, "").to_sym
         }
       }
+
     end
   end
 end
data/lib/remi/job.rb
@@ -1,176 +1,342 @@
 module Remi
-  module Job
-    module JobClassMethods
-      attr_accessor :params
-      attr_accessor :sources
-      attr_accessor :targets
-      attr_accessor :transforms
 
-      def define_param(key, value)
-        @params ||= Hash.new { |h, key| raise "Parameter #{key} is not defined" }
-        @params[key] = value
+  # The Job class is the foundation for all Remi ETL jobs. It
+  # provides a DSL for defining Remi jobs in a way that is natural for
+  # ETL style applications. In a Remi job, the user defines all of
+  # the sources, transforms, and targets necessary to transform data.
+  # Any number of sources, transforms, and targets can be defined.
+  # Transforms can call other parameterized sub-transforms. Jobs can
+  # collect data from other parameterized sub-jobs, pass data to other
+  # sub-jobs, or both pass and collect data from other sub-jobs.
+  #
+  # Jobs are executed by calling the `#execute` method in an instance
+  # of the job. This triggers all transforms to be executed in the
+  # order they are defined. Sub-transforms are only executed if they
+  # are referenced in a transform. After all transforms have
+  # executed, the targets are loaded in the order they are defined.
+  #
+  #
+  #
+  # @example
+  #
+  #   class MyJob < Remi::Job
+  #     source :my_csv_file do
+  #       extractor my_extractor
+  #       parser my_parser
+  #       enforce_types
+  #     end
+  #
+  #     target :my_transformed_file do
+  #       loader my_loader
+  #     end
+  #
+  #     transform :transform_data do
+  #       # Data sources are converted into a dataframe the first time the #df method is called.
+  #       transform_work = my_csv_file.df.dup # => a copy of the my_csv_file.df dataframe
+  #
+  #       # Any arbitrary Ruby is allowed in a transform block. Remi provides a convenient
+  #       # source to target map DSL to map fields from sources to targets
+  #       Remi::SourceToTargetMap.apply(transform_work, my_transformed_file.df) do
+  #         map source(:source_field_id) .target(:prefixed_id)
+  #           .transform(->(v) { "PREFIX#{v}" })
+  #       end
+  #     end
+  #   end
+  #
+  #   # The job is executed when `#execute` is called on an instance of the job.
+  #   # Transforms are executed in the order they are defined. Targets are loaded
+  #   # in the order they are defined after all transforms have been executed.
+  #   job = MyJob.new
+  #   job.execute
+  #
+  #
+  #
+  # @todo MOAR Examples! Subtransforms, subjobs, parameters, references to even more
+  #   complete sample jobs.
+  class Job
+    class << self
+
+      def inherited(base)
+        base.instance_variable_set(:@params, params.clone)
+        base.instance_variable_set(:@sources, sources.dup)
+        base.instance_variable_set(:@targets, targets.dup)
+        base.instance_variable_set(:@transforms, transforms.dup)
+        base.instance_variable_set(:@sub_jobs, sub_jobs.dup)
       end
 
-      def define_source(name, type_class, **options)
+      # @return [Job::Parameters] all parameters defined at the class level
+      def params
+        @params ||= Parameters.new
+      end
+
+      # Defines a job parameter.
+      # @example
+      #
+      #   class MyJob < Job
+      #     param(:my_param) { 'the best parameter' }
+      #   end
+      #
+      #   job = MyJob.new
+      #   job.params[:my_param] #=> 'the best parameter'
+      def param(name, &block)
+        params.__define__(name, &block)
+      end
+
+      # @return [Array<Symbol>] the list of data source names
+      def sources
         @sources ||= []
-        @sources << name unless @sources.include? name
+      end
 
-        define_method(name) do
-          iv_name = instance_variable_get("@#{name}")
-          return iv_name if iv_name
 
-          source = type_class.new(options)
+      # @return [Array<Symbol>] the list of sub-jobs
+      def sub_jobs
+        @sub_jobs ||= []
+      end
+
+      # Defines a sub job resource for this job.
+      # Note that the return value of the DSL block must be an instance of a Remi::Job
+      # @example
+      #
+      #   class MyJob < Job
+      #     sub_job(:my_sub_job) { MySubJob.new }
+      #   end
+      #
+      #   job = MyJob.new
+      #   job.sub_job.job #=> An instance of MySubJob
+      def sub_job(name, &block)
+        sub_jobs << name unless sub_jobs.include? name
+        attr_accessor name
+
+        define_method("__init_#{name}__".to_sym) do
+          sub_job = Job::SubJob.new(self, name: name, &block)
+          instance_variable_set("@#{name}", sub_job)
+        end
+      end
+
+      # Defines a data source.
+      # @example
+      #
+      #   class MyJob < Job
+      #     source :my_source do
+      #       extractor my_extractor
+      #       parser my_parser
+      #     end
+      #   end
+      #
+      #   job = MyJob.new
+      #   job.my_source.df #=> a dataframe generated after extracting and parsing
+      def source(name, &block)
+        sources << name unless sources.include? name
+        attr_accessor name
+
+        define_method("__init_#{name}__".to_sym) do
+          source = DataSource.new(self, name: name, &block)
           instance_variable_set("@#{name}", source)
         end
       end
 
-      def define_target(name, type_class, **options)
+      # @return [Array<Symbol>] the list of data target names
+      def targets
         @targets ||= []
-        @targets << name unless @targets.include? name
-
-        define_method(name) do
-          iv_name = instance_variable_get("@#{name}")
-          return iv_name if iv_name
+      end
 
-          target = type_class.new(options)
+      # Defines a data target.
+      # @example
+      #
+      #   class MyJob < Job
+      #     target :my_target do
+      #       extractor my_extractor
+      #       parser my_parser
+      #     end
+      #   end
+      #
+      #   job = MyJob.new
+      #   job.my_target.df #=> a dataframe generated after extracting and parsing
+      def target(name, &block)
+        targets << name unless targets.include? name
+        attr_accessor name
+
+        define_method("__init_#{name}__".to_sym) do
+          target = DataTarget.new(self, name: name, &block)
           instance_variable_set("@#{name}", target)
         end
       end
 
-      def define_transform(name, sources: [], targets: [], &block)
-        @transforms ||= {}
-        @transforms[name] = { sources: Array(sources), targets: Array(targets) }
+      # @return [Array<Symbol>] the list of transform names
+      def transforms
+        @transforms ||= []
+      end
 
-        define_method(name) do
-          instance_eval { @logger.info "Running transformation #{__method__}" }
-          instance_eval(&block)
+      # Defines a transform.
+      # @example
+      #
+      #   class MyJob < Job
+      #     transform :my_transform do
+      #       puts "hello from my_transform!"
+      #     end
+      #   end
+      #
+      #   job = MyJob.new
+      #   job.my_transform.execute #=>(stdout) 'hello from my_transform!'
+      def transform(name, &block)
+        transforms << name unless transforms.include? name
+        attr_accessor name
+
+        define_method("__init_#{name}__".to_sym) do
+          transform = Transform.new(self, name: name, &block)
+          instance_variable_set("@#{name}", transform)
         end
       end
 
-      def params
-        @params || {}
+      # Defines a sub-transform.
+      # @example
+      #
+      #   class MyJob < Job
+      #     sub_transform :my_sub_transform, greeting: 'hello' do
+      #       puts "#{params[:greeting]} from my_sub_transform!"
+      #     end
+      #
+      #     transform :my_transform do
+      #       import :my_sub_transform, greeting: 'bonjour' do
+      #       end
+      #     end
+      #   end
+      #
+      #   job = MyJob.new
+      #   job.my_transform.execute #=>(stdout) 'bonjour from my_sub_transform!'
+      def sub_transform(name, **kargs, &block)
+        define_method(name) do
+          Transform.new(self, name: name, **kargs, &block)
+        end
       end
+    end
 
-      def sources
-        @sources || []
-      end
+    # Initializes the job
+    #
+    # @param work_dir [String, Path] sets the working directory for this job
+    # @param logger [Object] sets the logger for the job
+    # @param kargs [Hash] Optional job parameters (can be referenced in the job via `#params`)
+    def initialize(work_dir: Settings.work_dir, logger: Settings.logger, **kargs)
+      @work_dir = work_dir
+      @logger = logger
+      create_work_dir
 
-      def targets
-        @targets || []
-      end
+      __init_params__ **kargs
+      __init_sub_jobs__
+      __init_sources__
+      __init_targets__
+      __init_transforms__
+    end
 
-      def transforms
-        @transforms || {}
-      end
+    # @return [String] the working directory used for temporary data
+    attr_reader :work_dir
 
+    # @return [Object] the logging object
+    attr_reader :logger
 
-      def work_dir
-        Settings.work_dir
-      end
+    # @return [Job::Parameters] parameters defined at the class level or during instantiation
+    attr_reader :params
 
-      def self.extended(receiver)
-      end
+    # @return [Array] list of sub_jobs defined in the job
+    attr_reader :sub_jobs
 
-      def included(receiver)
-        receiver.extend(JobClassMethods)
-        receiver.params = self.params.merge(receiver.params)
-        receiver.sources = self.sources + receiver.sources
-        receiver.targets = self.targets + receiver.targets
-        receiver.transforms = self.transforms.merge(receiver.transforms)
-      end
-    end
+    # @return [Array] list of sources defined in the job
+    attr_reader :sources
 
-    def self.included(receiver)
-      receiver.extend(JobClassMethods)
-    end
+    # @return [Array] list of targets defined in the job
+    attr_reader :targets
 
+    # @return [Array] list of transforms defined in the job
+    attr_reader :transforms
 
-    def params
-      self.class.params
-    end
 
-    def sources
-      self.class.sources
+    # Creates a temporary working directory for the job
+    def create_work_dir
+      @logger.info "Creating working directory #{work_dir}"
+      FileUtils.mkdir_p work_dir
     end
 
-    def targets
-      self.class.targets
+
+    # @return [self] the job object (needed to reference parent job in transform DSL)
+    def job
+      self
     end
 
-    def transforms
-      self.class.transforms
+    def to_s
+      inspect
     end
 
+    def inspect
+      "#<#{Remi::Job}>: #{self.class}\n" +
+        " parameters: #{params.to_h.keys}\n" +
+        " sources: #{sources}\n" +
+        " targets: #{targets}\n" +
+        " transforms: #{transforms}"
+    end
 
 
-    def initialize(runtime_params: {}, delete_work_dir: true, logger: Settings.logger)
-      @runtime_params = runtime_params
-      @delete_work_dir = delete_work_dir
-      @logger = logger
-      create_work_dir
+    # Execute the specified components of the job.
+    #
+    # @param components [Array<symbol>] list of components to execute (e.g., `:transforms`, `:load_targets`)
+    #
+    # @return [self]
+    def execute(*components)
+      execute_transforms if components.empty? || components.include?(:transforms)
+      execute_load_targets if components.empty? || components.include?(:load_targets)
+      self
    end
 
-    attr_accessor :runtime_params
+    private
 
-    def work_dir
-      self.class.work_dir
+    def __init_params__(**kargs)
+      @params = self.class.params.clone
+      add_params **kargs
+      params.context = self
     end
 
-    def finalize
-      delete_work_dir
+    def __init_sub_jobs__
+      @sub_jobs = self.class.sub_jobs
+      @sub_jobs.each do |sub_job|
+        send("__init_#{sub_job}__".to_sym)
+      end
     end
 
-    def delete_work_dir
-      if @delete_work_dir && (work_dir.match /^#{Dir.tmpdir}/)
-        @logger.info "Deleting temporary directory #{work_dir}"
-        FileUtils.rm_r work_dir
-      else
-        @logger.debug "Not going to delete working directory #{work_dir}"
-        nil
+    def __init_sources__
+      @sources = self.class.sources
+      @sources.each do |source|
+        send("__init_#{source}__".to_sym)
       end
     end
 
-    def create_work_dir
-      @logger.info "Creating working directory #{work_dir}"
-      FileUtils.mkdir_p work_dir
+    def __init_targets__
+      @targets = self.class.targets
+      @targets.each do |target|
+        send("__init_#{target}__".to_sym)
+      end
     end
 
-    # Public: Runs any transforms that use the sources and targets selected. If
-    # source and target is not specified, then all transforms will be run.
-    # If only the source is specified, then all transforms that use any of the
-    # sources will be run. Same for specified transforms.
-    #
-    # sources - Array of source names
-    # targets - Array of target names
-    #
-    # Returns an array containing the result of each transform.
-    def run_transforms_using(sources: nil, targets: nil)
-      transforms.map do |t, st|
-        selected_sources = (st[:sources] & Array(sources || st[:sources])).size > 0
-        selected_targets = (st[:targets] & Array(targets || st[:targets])).size > 0
-        self.send(t) if selected_sources && selected_targets
+    def __init_transforms__
+      @transforms = self.class.transforms
+      @transforms.each do |transform|
+        send("__init_#{transform}__".to_sym)
      end
     end
 
-    def run_all_transforms
-      transforms.map { |t, st| self.send(t) }
+    # Executes all transforms defined
+    def execute_transforms
+      transforms.map { |t| send(t).execute }
+      self
     end
 
-    def load_all_targets
-      targets.each do |target|
-        @logger.info "Loading target #{target}"
-        self.send(target).tap { |t| t.respond_to?(:load) ? t.load : nil }
-      end
+    # Loads all targets defined
+    def execute_load_targets
+      targets.each { |t| send(t).load }
+      self
     end
 
-    # Public: Runs all transforms defined in the job.
-    #
-    # Returns the job instance.
-    def run
-      # Do all of the stuff here
-      run_all_transforms
-      load_all_targets
-      self
+    # Adds all parameters listed to the job parameters
+    def add_params(**kargs)
+      kargs.each { |k,v| params[k] = v }
    end
   end
 end
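Pulling the class-level pieces above together, a hedged sketch of a complete (if trivial) job: a class-level parameter, a transform that reads it through `job.params`, and a parameter override at instantiation. The class and parameter names are illustrative.

class GreetingJob < Remi::Job
  param(:greeting) { 'hello' }

  transform :say_it do
    puts "#{job.params[:greeting]}, world"
  end
end

GreetingJob.new.execute                        # prints "hello, world", then loads targets (none defined here)
GreetingJob.new(greeting: 'bonjour').execute   # keyword arguments become job parameters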