remi 0.2.42 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. checksums.yaml +4 -4
  2. data/.yardopts +7 -0
  3. data/Gemfile +1 -1
  4. data/Gemfile.lock +13 -26
  5. data/README.md +1 -1
  6. data/features/step_definitions/remi_step.rb +33 -13
  7. data/features/sub_job_example.feature +24 -0
  8. data/features/sub_transform_example.feature +35 -0
  9. data/features/sub_transform_many_to_many.feature +49 -0
  10. data/features/support/env_app.rb +1 -1
  11. data/jobs/all_jobs_shared.rb +19 -16
  12. data/jobs/copy_source_job.rb +11 -9
  13. data/jobs/csv_file_target_job.rb +10 -9
  14. data/jobs/json_job.rb +18 -14
  15. data/jobs/metadata_job.rb +33 -28
  16. data/jobs/parameters_job.rb +14 -11
  17. data/jobs/sample_job.rb +106 -77
  18. data/jobs/sftp_file_target_job.rb +14 -13
  19. data/jobs/sub_job_example_job.rb +86 -0
  20. data/jobs/sub_transform_example_job.rb +43 -0
  21. data/jobs/sub_transform_many_to_many_job.rb +46 -0
  22. data/jobs/transforms/concatenate_job.rb +16 -12
  23. data/jobs/transforms/data_frame_sieve_job.rb +24 -19
  24. data/jobs/transforms/date_diff_job.rb +15 -11
  25. data/jobs/transforms/nvl_job.rb +16 -12
  26. data/jobs/transforms/parse_date_job.rb +17 -14
  27. data/jobs/transforms/partitioner_job.rb +27 -19
  28. data/jobs/transforms/prefix_job.rb +13 -10
  29. data/jobs/transforms/truncate_job.rb +14 -10
  30. data/jobs/transforms/truthy_job.rb +11 -8
  31. data/lib/remi.rb +25 -11
  32. data/lib/remi/data_frame.rb +4 -4
  33. data/lib/remi/data_frame/daru.rb +1 -37
  34. data/lib/remi/data_subject.rb +234 -48
  35. data/lib/remi/data_subjects/csv_file.rb +171 -0
  36. data/lib/remi/data_subjects/data_frame.rb +106 -0
  37. data/lib/remi/data_subjects/file_system.rb +115 -0
  38. data/lib/remi/data_subjects/local_file.rb +109 -0
  39. data/lib/remi/data_subjects/none.rb +31 -0
  40. data/lib/remi/data_subjects/postgres.rb +186 -0
  41. data/lib/remi/data_subjects/s3_file.rb +84 -0
  42. data/lib/remi/data_subjects/salesforce.rb +211 -0
  43. data/lib/remi/data_subjects/sftp_file.rb +196 -0
  44. data/lib/remi/data_subjects/sub_job.rb +50 -0
  45. data/lib/remi/dsl.rb +74 -0
  46. data/lib/remi/encoder.rb +45 -0
  47. data/lib/remi/extractor.rb +21 -0
  48. data/lib/remi/field_symbolizers.rb +1 -0
  49. data/lib/remi/job.rb +279 -113
  50. data/lib/remi/job/parameters.rb +90 -0
  51. data/lib/remi/job/sub_job.rb +35 -0
  52. data/lib/remi/job/transform.rb +165 -0
  53. data/lib/remi/loader.rb +22 -0
  54. data/lib/remi/monkeys/daru.rb +4 -0
  55. data/lib/remi/parser.rb +44 -0
  56. data/lib/remi/testing/business_rules.rb +17 -23
  57. data/lib/remi/testing/data_stub.rb +2 -2
  58. data/lib/remi/version.rb +1 -1
  59. data/remi.gemspec +3 -0
  60. data/spec/data_subject_spec.rb +475 -11
  61. data/spec/data_subjects/csv_file_spec.rb +69 -0
  62. data/spec/data_subjects/data_frame_spec.rb +52 -0
  63. data/spec/{extractor → data_subjects}/file_system_spec.rb +0 -0
  64. data/spec/{extractor → data_subjects}/local_file_spec.rb +0 -0
  65. data/spec/data_subjects/none_spec.rb +41 -0
  66. data/spec/data_subjects/postgres_spec.rb +80 -0
  67. data/spec/{extractor → data_subjects}/s3_file_spec.rb +0 -0
  68. data/spec/data_subjects/salesforce_spec.rb +117 -0
  69. data/spec/{extractor → data_subjects}/sftp_file_spec.rb +16 -0
  70. data/spec/data_subjects/sub_job_spec.rb +33 -0
  71. data/spec/encoder_spec.rb +38 -0
  72. data/spec/extractor_spec.rb +11 -0
  73. data/spec/fixtures/sf_bulk_helper_stubs.rb +443 -0
  74. data/spec/job/transform_spec.rb +257 -0
  75. data/spec/job_spec.rb +507 -0
  76. data/spec/loader_spec.rb +11 -0
  77. data/spec/parser_spec.rb +38 -0
  78. data/spec/sf_bulk_helper_spec.rb +117 -0
  79. data/spec/testing/data_stub_spec.rb +5 -3
  80. metadata +109 -27
  81. data/features/aggregate.feature +0 -42
  82. data/jobs/aggregate_job.rb +0 -31
  83. data/jobs/transforms/transform_jobs.rb +0 -4
  84. data/lib/remi/data_subject/csv_file.rb +0 -162
  85. data/lib/remi/data_subject/data_frame.rb +0 -52
  86. data/lib/remi/data_subject/postgres.rb +0 -134
  87. data/lib/remi/data_subject/salesforce.rb +0 -136
  88. data/lib/remi/data_subject/sftp_file.rb +0 -65
  89. data/lib/remi/extractor/file_system.rb +0 -92
  90. data/lib/remi/extractor/local_file.rb +0 -43
  91. data/lib/remi/extractor/s3_file.rb +0 -57
  92. data/lib/remi/extractor/sftp_file.rb +0 -83
  93. data/spec/data_subject/csv_file_spec.rb +0 -79
  94. data/spec/data_subject/data_frame.rb +0 -27
@@ -0,0 +1,196 @@
1
+ module Remi
2
+
3
+ # Sftp File extractor
4
+ # Used to extract files from an SFTP server
5
+ #
6
+ # @example
7
+ #
8
+ # class MyJob < Remi::Job
9
+ # source :some_file do
10
+ # extractor Remi::Extractor::SftpFile.new(
11
+ # credentials: {
12
+ # host: 'coolserver.com',
13
+ # username: 'myself',
14
+ # password: 'secret'
15
+ # },
16
+ # remote_path: '/',
17
+ # pattern: /^some_file_\d{14}\.csv/,
18
+ # most_recent_only: true
19
+ # )
20
+ #
21
+ # parser Remi::Parser::CsvFile.new(
22
+ # csv_options: {
23
+ # headers: true,
24
+ # col_sep: ','
25
+ # }
26
+ # )
27
+ # end
28
+ # end
29
+ #
30
+ # job = MyJob.new
31
+ # job.some_file.df
32
+ # # =>#<Daru::DataFrame:70153153438500 @name = 4c59cfdd-7de7-4264-8666-83153f46a9e4 @size = 3>
33
+ # # id name
34
+ # # 0 1 Albert
35
+ # # 1 2 Betsy
36
+ # # 2 3 Camu
37
+ class Extractor::SftpFile < Extractor::FileSystem
38
+ N_RETRY = 3
39
+
40
+ # @param credentials [Hash] Options hash containing login credentials
41
+ # @option credentials [String] :host SFTP host (e.g., coolserver.com)
42
+ # @option credentials [String] :username SFTP username
43
+ # @option credentials [String] :password SFTP password
44
+ # @option credentials [String] :port SFTP port (default: 22)
45
+ def initialize(*args, **kargs, &block)
46
+ super
47
+ init_sftp_extractor(*args, **kargs)
48
+ end
49
+
50
+ attr_reader :host
51
+ attr_reader :username
52
+ attr_reader :password
53
+ attr_reader :port
54
+
55
+ # Called to extract files from the source filesystem.
56
+ # @return [Array<String>] An array of paths to a local copy of the files extracted
57
+ def extract
58
+ connection do |sftp|
59
+ entries.map do |entry|
60
+ local_file = File.join(@local_path, entry.name)
61
+ logger.info "Downloading #{entry.name} to #{local_file}"
62
+ retry_download { sftp.download!(File.join(@remote_path, entry.name), local_file) }
63
+ local_file
64
+ end
65
+ end
66
+ end
67
+
68
+ # @return [Array<Extractor::FileSystemEntry>] (Memoized) list of entries in the remote path
69
+ def all_entries
70
+ @all_entries ||= all_entries!
71
+ end
72
+
73
+ # @return [Array<Extractor::FileSystemEntry>] Freshly fetched (not memoized) list of entries in the remote path
74
+ def all_entries!
75
+ sftp_entries = connection { |sftp| sftp.dir.entries(@remote_path) }
76
+ sftp_entries.map do |entry|
77
+ # Early versions of the protocol don't support create time, fake it with modified time?
78
+ FileSystemEntry.new(
79
+ pathname: File.join(@remote_path, entry.name),
80
+ create_time: entry.attributes.respond_to?(:createtime) ? entry.attributes.createtime : entry.attributes.mtime,
81
+ modified_time: entry.attributes.mtime
82
+ )
83
+ end
84
+ end
85
+
86
+
87
+ private
88
+
89
+ def init_sftp_extractor(*args, credentials:, **kargs)
90
+ @host = credentials.fetch(:host)
91
+ @username = credentials.fetch(:username)
92
+ @password = credentials.fetch(:password)
93
+ @port = credentials.fetch(:port, '22')
94
+ end
95
+
96
+ def connection(&block)
97
+ result = nil
98
+ Net::SFTP.start(@host, @username, password: @password, port: @port) do |sftp|
99
+ result = yield sftp
100
+ end
101
+ result
102
+ end
103
+
104
+ def retry_download(&block)
105
+ 1.upto(N_RETRY).each do |itry|
106
+ begin
107
+ block.call
108
+ break
109
+ rescue RuntimeError => err
110
+ raise err unless itry < N_RETRY
111
+ logger.error "Download failed with error: #{err.message}"
112
+ logger.error "Retry attempt #{itry}/#{N_RETRY-1}"
113
+ sleep(1)
114
+ end
115
+ end
116
+ end
117
+ end
118
+
119
+
120
+
121
+ # SFTP file loader
122
+ #
123
+ # @example
124
+ # class MyJob < Remi::Job
125
+ # target :my_target do
126
+ # encoder Remi::Encoder::CsvFile.new(
127
+ # csv_options: { col_sep: '|' }
128
+ # )
129
+ # loader Remi::Loader::SftpFile.new(
130
+ # credentials: { },
131
+ # remote_path: 'some_test.csv'
132
+ # )
133
+ # loader Remi::Loader::SftpFile.new(
134
+ # credentials: { },
135
+ # remote_path: 'some_other_test.csv'
136
+ # )
137
+ # end
138
+ # end
139
+ #
140
+ # my_df = Daru::DataFrame.new({ a: 1.upto(5).to_a, b: 6.upto(10) })
141
+ # job = MyJob.new
142
+ # job.my_target.df = my_df
143
+ # job.my_target.load
144
+ class Loader::SftpFile < Loader
145
+
146
+ # @param remote_path [String, Pathname] Full path to the file to be created on the target filesystem
147
+ def initialize(*args, **kargs, &block)
148
+ super
149
+ init_sftp_loader(*args, **kargs, &block)
150
+ end
151
+
152
+ attr_reader :remote_path
153
+
154
+ # Copies data to the SFTP Server
155
+ # @param data [Object] The path to the file in the temporary work location
156
+ # @return [true] On success
157
+ def load(data)
158
+ logger.info "Uploading #{data} to #{@credentials[:username]}@#{@credentials[:host]}: #{@remote_path}"
159
+ connection do |sftp|
160
+ retry_upload { sftp.upload! data, @remote_path }
161
+ end
162
+
163
+ true
164
+ end
165
+
166
+
167
+ private
168
+
169
+ def init_sftp_loader(*args, credentials:, remote_path:, **kargs, &block)
170
+ @credentials = credentials
171
+ @remote_path = remote_path
172
+ end
173
+
174
+ def connection(&block)
175
+ result = nil
176
+ Net::SFTP.start(@credentials[:host], @credentials[:username], password: @credentials[:password], port: @credentials[:port] || '22') do |sftp|
177
+ result = yield sftp
178
+ end
179
+ result
180
+ end
181
+
182
+ def retry_upload(ntry=2, &block)
183
+ 1.upto(ntry).each do |itry|
184
+ begin
185
+ block.call
186
+ break
187
+ rescue RuntimeError => err
188
+ raise err unless itry < ntry
189
+ logger.error "Upload failed with error: #{err.message}"
190
+ logger.error "Retry attempt #{itry}/#{ntry-1}"
191
+ sleep(1)
192
+ end
193
+ end
194
+ end
195
+ end
196
+ end
@@ -0,0 +1,50 @@
1
+ module Remi
2
+ class Extractor::SubJob < Extractor
3
+
4
+ # @param sub_job [Object] The name (relative to parent job) of the subjob to use
5
+ # @param data_subject [Symbol] The name (relative to the sub job) of the sub job's data frame
6
+ def initialize(*args, **kargs, &block)
7
+ super
8
+ init_sub_job_extractor(*args, **kargs, &block)
9
+ end
10
+
11
+ attr_accessor :sub_job, :data_subject
12
+
13
+ def extract
14
+ sub_job.job.send(data_subject).df
15
+ end
16
+
17
+ private
18
+
19
+ def init_sub_job_extractor(*args, sub_job:, data_subject:, **kargs, &block)
20
+ @sub_job = sub_job
21
+ @data_subject = data_subject
22
+ end
23
+
24
+ end
25
+
26
+ class Loader::SubJob < Loader
27
+ # @param sub_job [Object] The name (relative to parent job) of the subjob to use
28
+ # @param data_subject [Symbol] The name (relative to the sub job) of the sub job's data frame
29
+ def initialize(*args, **kargs, &block)
30
+ super
31
+ init_sub_job_loader(*args, **kargs, &block)
32
+ end
33
+
34
+ attr_accessor :sub_job, :data_subject
35
+
36
+ # @param data_frame [Object] Data frame to load to target sub job data subject
37
+ # @return [true] On success
38
+ def load(data_frame)
39
+ sub_job.job.send(data_subject).df = data_frame
40
+ true
41
+ end
42
+
43
+ private
44
+
45
+ def init_sub_job_loader(*args, sub_job:, data_subject:, **kargs, &block)
46
+ @sub_job = sub_job
47
+ @data_subject = data_subject
48
+ end
49
+ end
50
+ end
@@ -0,0 +1,74 @@
1
+ module Remi
2
+
3
+ # @api private
4
+ #
5
+ # A namespace for functions relating to the execution of a block against a
6
+ # proxy object.
7
+ #
8
+ # Much of this code was borrowed from [Docile](https://github.com/ms-ati/docile)
9
+ # and was modified to support different fallback contexts.
10
+ # @see Docile [Docile](https://github.com/ms-ati/docile)
11
+
12
+ module Dsl
13
+ # Execute a block in the context of an object whose methods represent the
14
+ # commands in a DSL, using a specific proxy class.
15
+ #
16
+ # @param dsl [Object] context object whose methods make up the
17
+ # (initial) DSL
18
+ # @param fallback_dsl [Object] context object that the DSL should fall back
19
+ # to if the primary context fails to resolve
20
+ # @param proxy_type [FallbackContextProxy, ChainingFallbackContextProxy]
21
+ # which class to instantiate as proxy context
22
+ # @param args [Array] arguments to be passed to the block
23
+ # @param block [Proc] the block of DSL commands to be executed
24
+ # @return [Object] the return value of the block
25
+
26
+ def exec_in_proxy_context(dsl, fallback_dsl, proxy_type, *args, &block)
27
+ block_context = fallback_dsl
28
+ proxy_context = proxy_type.new(dsl, block_context)
29
+ begin
30
+ block_context.instance_variables.each do |ivar|
31
+ value_from_block = block_context.instance_variable_get(ivar)
32
+ proxy_context.instance_variable_set(ivar, value_from_block)
33
+ end
34
+ proxy_context.instance_exec(*args, &block)
35
+ ensure
36
+ block_context.instance_variables.each do |ivar|
37
+ value_from_dsl_proxy = proxy_context.instance_variable_get(ivar)
38
+ block_context.instance_variable_set(ivar, value_from_dsl_proxy)
39
+ end
40
+ end
41
+ end
42
+ module_function :exec_in_proxy_context
43
+
44
+
45
+ # Execute a block in the context of an object whose methods represent the
46
+ # commands in a DSL.
47
+ #
48
+ # @note Use with an *imperative* DSL (commands modify the context object)
49
+ #
50
+ # Use this method to execute an *imperative* DSL, which means that:
51
+ #
52
+ # 1. Each command mutates the state of the DSL context object
53
+ # 2. The return value of each command is ignored
54
+ # 3. The final return value is the original context object
55
+ #
56
+ #
57
+ # @param dsl [Object] context object whose methods make up the DSL
58
+ # @param fallback_dsl [Object] context object that the DSL should fallback to
59
+ # @param args [Array] arguments to be passed to the block
60
+ # @param block [Proc] the block of DSL commands to be executed against the
61
+ # `dsl` context object
62
+ # @return [Object] the `dsl` context object after executing the block
63
+ def dsl_eval(dsl, fallback_dsl, *args, &block)
64
+ exec_in_proxy_context(dsl, fallback_dsl, Docile::FallbackContextProxy, *args, &block)
65
+ dsl
66
+ end
67
+ module_function :dsl_eval
68
+
69
+ def dsl_return(dsl, fallback_dsl, *args, &block)
70
+ exec_in_proxy_context(dsl, fallback_dsl, Docile::FallbackContextProxy, *args, &block)
71
+ end
72
+ module_function :dsl_return
73
+ end
74
+ end
@@ -0,0 +1,45 @@
1
+ module Remi
2
+ # An encoder is an object that converts a dataframe into a form that can
3
+ # be used by a Remi::Loader. This is a parent class meant to be
4
+ # inherited by child classes that define specific ways to encode
5
+ # data.
6
+ class Encoder
7
+
8
+ # @param context [Object] The context (e.g., DataTarget) for the encoder (default: `nil`)
9
+ # @param field_symbolizer [Proc] The field symbolizer to use for this encoder
10
+ # @param fields [Remi::Fields] A hash of field metadata to be used by the encoder
11
+ def initialize(*args, context: nil, field_symbolizer: Remi::FieldSymbolizers[:standard], fields: Remi::Fields.new({}), logger: Remi::Settings.logger, **kargs, &block)
12
+ @context = context
13
+ @field_symbolizer = field_symbolizer
14
+
15
+ @fields = fields
16
+ @logger = logger
17
+ end
18
+
19
+ attr_accessor :context
20
+ attr_accessor :logger
21
+ attr_writer :field_symbolizer
22
+ attr_writer :fields
23
+
24
+ # Any child classes need to define an encode method that converts the
25
+ # data subject's dataframe into a structure that can be loaded into the
26
+ # target system.
27
+ # @param dataframe [Remi::DataFrame] The dataframe to be encoded
28
+ # @return [Object] The encoded data to be loaded into the target
29
+ def encode(dataframe)
30
+ raise NoMethodError, "#{__method__} not defined for #{self.class.name}"
31
+ end
32
+
33
+ # @return [Proc] The field symbolizer (uses the context field symbolizer if defined)
34
+ def field_symbolizer
35
+ return context.field_symbolizer if context if context.respond_to? :field_symbolizer
36
+ @field_symbolizer
37
+ end
38
+
39
+ # @return [Remi::Fields] The fields (uses the context fields if defined)
40
+ def fields
41
+ return context.fields if context if context.respond_to? :fields
42
+ @fields
43
+ end
44
+ end
45
+ end
@@ -0,0 +1,21 @@
1
+ module Remi
2
+ # An extractor is an object meant to extract data from some external system.
3
+ # This is a parent class meant to be inherited by child classes that
4
+ # define specific ways to extract data.
5
+ class Extractor
6
+
7
+ def initialize(*args, logger: Remi::Settings.logger, **kargs, &block)
8
+ @logger = logger
9
+ end
10
+
11
+ # @return [Object] The logger object used by the extractor
12
+ attr_accessor :logger
13
+
14
+ # Any child classes need to define an extract method that returns data
15
+ # in a format that an appropriate parser can use to convert into a dataframe
16
+ def extract
17
+ raise NoMethodError, "#{__method__} not defined for #{self.class.name}"
18
+ end
19
+
20
+ end
21
+ end
@@ -12,6 +12,7 @@ module Remi
12
12
  gsub(/[^0-9a-zA-Z_.]+/, "").to_sym
13
13
  }
14
14
  }
15
+
15
16
  end
16
17
  end
17
18
  end
@@ -1,176 +1,342 @@
1
1
  module Remi
2
- module Job
3
- module JobClassMethods
4
- attr_accessor :params
5
- attr_accessor :sources
6
- attr_accessor :targets
7
- attr_accessor :transforms
8
2
 
9
- def define_param(key, value)
10
- @params ||= Hash.new { |h, key| raise "Parameter #{key} is not defined" }
11
- @params[key] = value
3
+ # The Job class is the foundation for all Remi ETL jobs. It
4
+ # provides a DSL for defining Remi jobs in a way that is natural for
5
+ # ETL style applications. In a Remi job, the user defines all of
6
+ # the sources, transforms, and targets necessary to transform data.
7
+ # Any number of sources, transforms, and targets can be defined.
8
+ # Transforms can call other parameterized sub-transforms. Jobs can
9
+ # collect data from other parameterized sub-jobs, pass data to other
10
+ # sub-jobs, or both pass and collect data from other sub-jobs.
11
+ #
12
+ # Jobs are executed by calling the `#execute` method in an instance
13
+ # of the job. This triggers all transforms to be executed in the
14
+ # order they are defined. Sub-transforms are only executed if they
15
+ # are referenced in a transform. After all transforms have
16
+ # executed, the targets are loaded in the order they are defined.
17
+ #
18
+ #
19
+ #
20
+ # @example
21
+ #
22
+ # class MyJob < Remi::Job
23
+ # source :my_csv_file do
24
+ # extractor my_extractor
25
+ # parser my_parser
26
+ # enforce_types
27
+ # end
28
+ #
29
+ # target :my_transformed_file do
30
+ # loader my_loader
31
+ # end
32
+ #
33
+ # transform :transform_data do
34
+ # # Data sources are converted into a dataframe the first time the #df method is called.
35
+ # transform_work = my_csv_file.df.dup # => a copy of the my_csv_file.df dataframe
36
+ #
37
+ # # Any arbitrary Ruby is allowed in a transform block. Remi provides a convenient
38
+ # # source to target map DSL to map fields from sources to targets
39
+ # Remi::SourceToTargetMap.apply(transform_work, my_transformed_file.df) do
40
+ # map source(:source_field_id) .target(:prefixed_id)
41
+ # .transform(->(v) { "PREFIX#{v}" })
42
+ # end
43
+ # end
44
+ # end
45
+ #
46
+ # # The job is executed when `#execute` is called on an instance of the job.
47
+ # # Transforms are executed in the order they are defined. Targets are loaded
48
+ # # in the order they are defined after all transforms have been executed.
49
+ # job = MyJob.new
50
+ # job.execute
51
+ #
52
+ #
53
+ #
54
+ # @todo MOAR Examples! Subtransforms, subjobs, parameters, references to even more
55
+ # complete sample jobs.
56
+ class Job
57
+ class << self
58
+
59
+ def inherited(base)
60
+ base.instance_variable_set(:@params, params.clone)
61
+ base.instance_variable_set(:@sources, sources.dup)
62
+ base.instance_variable_set(:@targets, targets.dup)
63
+ base.instance_variable_set(:@transforms, transforms.dup)
64
+ base.instance_variable_set(:@sub_jobs, sub_jobs.dup)
12
65
  end
13
66
 
14
- def define_source(name, type_class, **options)
67
+ # @return [Job::Parameters] all parameters defined at the class level
68
+ def params
69
+ @params ||= Parameters.new
70
+ end
71
+
72
+ # Defines a job parameter.
73
+ # @example
74
+ #
75
+ # class MyJob < Job
76
+ # param(:my_param) { 'the best parameter' }
77
+ # end
78
+ #
79
+ # job = MyJob.new
80
+ # job.params[:my_param] #=> 'the best parameter'
81
+ def param(name, &block)
82
+ params.__define__(name, &block)
83
+ end
84
+
85
+ # @return [Array<Symbol>] the list of data source names
86
+ def sources
15
87
  @sources ||= []
16
- @sources << name unless @sources.include? name
88
+ end
17
89
 
18
- define_method(name) do
19
- iv_name = instance_variable_get("@#{name}")
20
- return iv_name if iv_name
21
90
 
22
- source = type_class.new(options)
91
+ # @return [Array<Symbol>] the list of sub-jobs
92
+ def sub_jobs
93
+ @sub_jobs ||= []
94
+ end
95
+
96
+ # Defines a sub job resource for this job.
97
+ # Note that the return value of the DSL block must be an instance of a Remi::Job
98
+ # @example
99
+ #
100
+ # class MyJob < Job
101
+ # sub_job(:my_sub_job) { MySubJob.new }
102
+ # end
103
+ #
104
+ # job = MyJob.new
105
+ # job.my_sub_job.job #=> An instance of MySubJob
106
+ def sub_job(name, &block)
107
+ sub_jobs << name unless sub_jobs.include? name
108
+ attr_accessor name
109
+
110
+ define_method("__init_#{name}__".to_sym) do
111
+ sub_job = Job::SubJob.new(self, name: name, &block)
112
+ instance_variable_set("@#{name}", sub_job)
113
+ end
114
+ end
115
+
116
+ # Defines a data source.
117
+ # @example
118
+ #
119
+ # class MyJob < Job
120
+ # source :my_source do
121
+ # extractor my_extractor
122
+ # parser my_parser
123
+ # end
124
+ # end
125
+ #
126
+ # job = MyJob.new
127
+ # job.my_source.df #=> a dataframe generated after extracting and parsing
128
+ def source(name, &block)
129
+ sources << name unless sources.include? name
130
+ attr_accessor name
131
+
132
+ define_method("__init_#{name}__".to_sym) do
133
+ source = DataSource.new(self, name: name, &block)
23
134
  instance_variable_set("@#{name}", source)
24
135
  end
25
136
  end
26
137
 
27
- def define_target(name, type_class, **options)
138
+ # @return [Array<Symbol>] the list of data target names
139
+ def targets
28
140
  @targets ||= []
29
- @targets << name unless @targets.include? name
30
-
31
- define_method(name) do
32
- iv_name = instance_variable_get("@#{name}")
33
- return iv_name if iv_name
141
+ end
34
142
 
35
- target = type_class.new(options)
143
+ # Defines a data target.
144
+ # @example
145
+ #
146
+ # class MyJob < Job
147
+ # target :my_target do
148
+ # encoder my_encoder
149
+ # loader my_loader
150
+ # end
151
+ # end
152
+ #
153
+ # job = MyJob.new
154
+ # job.my_target.df #=> the dataframe that gets encoded and loaded into the target
155
+ def target(name, &block)
156
+ targets << name unless targets.include? name
157
+ attr_accessor name
158
+
159
+ define_method("__init_#{name}__".to_sym) do
160
+ target = DataTarget.new(self, name: name, &block)
36
161
  instance_variable_set("@#{name}", target)
37
162
  end
38
163
  end
39
164
 
40
- def define_transform(name, sources: [], targets: [], &block)
41
- @transforms ||= {}
42
- @transforms[name] = { sources: Array(sources), targets: Array(targets) }
165
+ # @return [Array<Symbol>] the list of transform names
166
+ def transforms
167
+ @transforms ||= []
168
+ end
43
169
 
44
- define_method(name) do
45
- instance_eval { @logger.info "Running transformation #{__method__}" }
46
- instance_eval(&block)
170
+ # Defines a transform.
171
+ # @example
172
+ #
173
+ # class MyJob < Job
174
+ # transform :my_transform do
175
+ # puts "hello from my_transform!"
176
+ # end
177
+ # end
178
+ #
179
+ # job = MyJob.new
180
+ # job.my_transform.execute #=>(stdout) 'hello from my_transform!'
181
+ def transform(name, &block)
182
+ transforms << name unless transforms.include? name
183
+ attr_accessor name
184
+
185
+ define_method("__init_#{name}__".to_sym) do
186
+ transform = Transform.new(self, name: name, &block)
187
+ instance_variable_set("@#{name}", transform)
47
188
  end
48
189
  end
49
190
 
50
- def params
51
- @params || {}
191
+ # Defines a sub-transform.
192
+ # @example
193
+ #
194
+ # class MyJob < Job
195
+ # sub_transform :my_sub_transform, greeting: 'hello' do
196
+ # puts "#{params[:greeting]} from my_sub_transform!"
197
+ # end
198
+ #
199
+ # transform :my_transform do
200
+ # import :my_sub_transform, greeting: 'bonjour' do
201
+ # end
202
+ # end
203
+ # end
204
+ #
205
+ # job = MyJob.new
206
+ # job.my_transform.execute #=>(stdout) 'bonjour from my_sub_transform!'
207
+ def sub_transform(name, **kargs, &block)
208
+ define_method(name) do
209
+ Transform.new(self, name: name, **kargs, &block)
210
+ end
52
211
  end
212
+ end
53
213
 
54
- def sources
55
- @sources || []
56
- end
214
+ # Initializes the job
215
+ #
216
+ # @param work_dir [String, Path] sets the working directory for this job
217
+ # @param logger [Object] sets the logger for the job
218
+ # @param kargs [Hash] Optional job parameters (can be referenced in the job via `#params`)
219
+ def initialize(work_dir: Settings.work_dir, logger: Settings.logger, **kargs)
220
+ @work_dir = work_dir
221
+ @logger = logger
222
+ create_work_dir
57
223
 
58
- def targets
59
- @targets || []
60
- end
224
+ __init_params__ **kargs
225
+ __init_sub_jobs__
226
+ __init_sources__
227
+ __init_targets__
228
+ __init_transforms__
229
+ end
61
230
 
62
- def transforms
63
- @transforms || {}
64
- end
231
+ # @return [String] the working directory used for temporary data
232
+ attr_reader :work_dir
65
233
 
234
+ # @return [Object] the logging object
235
+ attr_reader :logger
66
236
 
67
- def work_dir
68
- Settings.work_dir
69
- end
237
+ # @return [Job::Parameters] parameters defined at the class level or during instantiation
238
+ attr_reader :params
70
239
 
71
- def self.extended(receiver)
72
- end
240
+ # @return [Array] list of sub_jobs defined in the job
241
+ attr_reader :sub_jobs
73
242
 
74
- def included(receiver)
75
- receiver.extend(JobClassMethods)
76
- receiver.params = self.params.merge(receiver.params)
77
- receiver.sources = self.sources + receiver.sources
78
- receiver.targets = self.targets + receiver.targets
79
- receiver.transforms = self.transforms.merge(receiver.transforms)
80
- end
81
- end
243
+ # @return [Array] list of sources defined in the job
244
+ attr_reader :sources
82
245
 
83
- def self.included(receiver)
84
- receiver.extend(JobClassMethods)
85
- end
246
+ # @return [Array] list of targets defined in the job
247
+ attr_reader :targets
86
248
 
249
+ # @return [Array] list of transforms defined in the job
250
+ attr_reader :transforms
87
251
 
88
- def params
89
- self.class.params
90
- end
91
252
 
92
- def sources
93
- self.class.sources
253
+ # Creates a temporary working directory for the job
254
+ def create_work_dir
255
+ @logger.info "Creating working directory #{work_dir}"
256
+ FileUtils.mkdir_p work_dir
94
257
  end
95
258
 
96
- def targets
97
- self.class.targets
259
+
260
+ # @return [self] the job object (needed to reference parent job in transform DSL)
261
+ def job
262
+ self
98
263
  end
99
264
 
100
- def transforms
101
- self.class.transforms
265
+ def to_s
266
+ inspect
102
267
  end
103
268
 
269
+ def inspect
270
+ "#<#{Remi::Job}>: #{self.class}\n" +
271
+ " parameters: #{params.to_h.keys}\n" +
272
+ " sources: #{sources}\n" +
273
+ " targets: #{targets}\n" +
274
+ " transforms: #{transforms}"
275
+ end
104
276
 
105
277
 
106
- def initialize(runtime_params: {}, delete_work_dir: true, logger: Settings.logger)
107
- @runtime_params = runtime_params
108
- @delete_work_dir = delete_work_dir
109
- @logger = logger
110
- create_work_dir
278
+ # Execute the specified components of the job.
279
+ #
280
+ # @param components [Array<Symbol>] list of components to execute (e.g., `:transforms`, `:load_targets`)
281
+ #
282
+ # @return [self]
283
+ def execute(*components)
284
+ execute_transforms if components.empty? || components.include?(:transforms)
285
+ execute_load_targets if components.empty? || components.include?(:load_targets)
286
+ self
111
287
  end
112
288
 
113
- attr_accessor :runtime_params
289
+ private
114
290
 
115
- def work_dir
116
- self.class.work_dir
291
+ def __init_params__(**kargs)
292
+ @params = self.class.params.clone
293
+ add_params **kargs
294
+ params.context = self
117
295
  end
118
296
 
119
- def finalize
120
- delete_work_dir
297
+ def __init_sub_jobs__
298
+ @sub_jobs = self.class.sub_jobs
299
+ @sub_jobs.each do |sub_job|
300
+ send("__init_#{sub_job}__".to_sym)
301
+ end
121
302
  end
122
303
 
123
- def delete_work_dir
124
- if @delete_work_dir && (work_dir.match /^#{Dir.tmpdir}/)
125
- @logger.info "Deleting temporary directory #{work_dir}"
126
- FileUtils.rm_r work_dir
127
- else
128
- @logger.debug "Not going to delete working directory #{work_dir}"
129
- nil
304
+ def __init_sources__
305
+ @sources = self.class.sources
306
+ @sources.each do |source|
307
+ send("__init_#{source}__".to_sym)
130
308
  end
131
309
  end
132
310
 
133
- def create_work_dir
134
- @logger.info "Creating working directory #{work_dir}"
135
- FileUtils.mkdir_p work_dir
311
+ def __init_targets__
312
+ @targets = self.class.targets
313
+ @targets.each do |target|
314
+ send("__init_#{target}__".to_sym)
315
+ end
136
316
  end
137
317
 
138
- # Public: Runs any transforms that use the sources and targets selected. If
139
- # source and target is not specified, then all transforms will be run.
140
- # If only the source is specified, then all transforms that use any of the
141
- # sources will be run. Same for specified transforms.
142
- #
143
- # sources - Array of source names
144
- # targets - Array of target names
145
- #
146
- # Returns an array containing the result of each transform.
147
- def run_transforms_using(sources: nil, targets: nil)
148
- transforms.map do |t, st|
149
- selected_sources = (st[:sources] & Array(sources || st[:sources])).size > 0
150
- selected_targets = (st[:targets] & Array(targets || st[:targets])).size > 0
151
- self.send(t) if selected_sources && selected_targets
318
+ def __init_transforms__
319
+ @transforms = self.class.transforms
320
+ @transforms.each do |transform|
321
+ send("__init_#{transform}__".to_sym)
152
322
  end
153
323
  end
154
324
 
155
- def run_all_transforms
156
- transforms.map { |t, st| self.send(t) }
325
+ # Executes all transforms defined
326
+ def execute_transforms
327
+ transforms.map { |t| send(t).execute }
328
+ self
157
329
  end
158
330
 
159
- def load_all_targets
160
- targets.each do |target|
161
- @logger.info "Loading target #{target}"
162
- self.send(target).tap { |t| t.respond_to?(:load) ? t.load : nil }
163
- end
331
+ # Loads all targets defined
332
+ def execute_load_targets
333
+ targets.each { |t| send(t).load }
334
+ self
164
335
  end
165
336
 
166
- # Public: Runs all transforms defined in the job.
167
- #
168
- # Returns the job instance.
169
- def run
170
- # Do all of the stuff here
171
- run_all_transforms
172
- load_all_targets
173
- self
337
+ # Adds all parameters listed to the job parameters
338
+ def add_params(**kargs)
339
+ kargs.each { |k,v| params[k] = v }
174
340
  end
175
341
  end
176
342
  end