remi 0.2.42 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.yardopts +7 -0
- data/Gemfile +1 -1
- data/Gemfile.lock +13 -26
- data/README.md +1 -1
- data/features/step_definitions/remi_step.rb +33 -13
- data/features/sub_job_example.feature +24 -0
- data/features/sub_transform_example.feature +35 -0
- data/features/sub_transform_many_to_many.feature +49 -0
- data/features/support/env_app.rb +1 -1
- data/jobs/all_jobs_shared.rb +19 -16
- data/jobs/copy_source_job.rb +11 -9
- data/jobs/csv_file_target_job.rb +10 -9
- data/jobs/json_job.rb +18 -14
- data/jobs/metadata_job.rb +33 -28
- data/jobs/parameters_job.rb +14 -11
- data/jobs/sample_job.rb +106 -77
- data/jobs/sftp_file_target_job.rb +14 -13
- data/jobs/sub_job_example_job.rb +86 -0
- data/jobs/sub_transform_example_job.rb +43 -0
- data/jobs/sub_transform_many_to_many_job.rb +46 -0
- data/jobs/transforms/concatenate_job.rb +16 -12
- data/jobs/transforms/data_frame_sieve_job.rb +24 -19
- data/jobs/transforms/date_diff_job.rb +15 -11
- data/jobs/transforms/nvl_job.rb +16 -12
- data/jobs/transforms/parse_date_job.rb +17 -14
- data/jobs/transforms/partitioner_job.rb +27 -19
- data/jobs/transforms/prefix_job.rb +13 -10
- data/jobs/transforms/truncate_job.rb +14 -10
- data/jobs/transforms/truthy_job.rb +11 -8
- data/lib/remi.rb +25 -11
- data/lib/remi/data_frame.rb +4 -4
- data/lib/remi/data_frame/daru.rb +1 -37
- data/lib/remi/data_subject.rb +234 -48
- data/lib/remi/data_subjects/csv_file.rb +171 -0
- data/lib/remi/data_subjects/data_frame.rb +106 -0
- data/lib/remi/data_subjects/file_system.rb +115 -0
- data/lib/remi/data_subjects/local_file.rb +109 -0
- data/lib/remi/data_subjects/none.rb +31 -0
- data/lib/remi/data_subjects/postgres.rb +186 -0
- data/lib/remi/data_subjects/s3_file.rb +84 -0
- data/lib/remi/data_subjects/salesforce.rb +211 -0
- data/lib/remi/data_subjects/sftp_file.rb +196 -0
- data/lib/remi/data_subjects/sub_job.rb +50 -0
- data/lib/remi/dsl.rb +74 -0
- data/lib/remi/encoder.rb +45 -0
- data/lib/remi/extractor.rb +21 -0
- data/lib/remi/field_symbolizers.rb +1 -0
- data/lib/remi/job.rb +279 -113
- data/lib/remi/job/parameters.rb +90 -0
- data/lib/remi/job/sub_job.rb +35 -0
- data/lib/remi/job/transform.rb +165 -0
- data/lib/remi/loader.rb +22 -0
- data/lib/remi/monkeys/daru.rb +4 -0
- data/lib/remi/parser.rb +44 -0
- data/lib/remi/testing/business_rules.rb +17 -23
- data/lib/remi/testing/data_stub.rb +2 -2
- data/lib/remi/version.rb +1 -1
- data/remi.gemspec +3 -0
- data/spec/data_subject_spec.rb +475 -11
- data/spec/data_subjects/csv_file_spec.rb +69 -0
- data/spec/data_subjects/data_frame_spec.rb +52 -0
- data/spec/{extractor → data_subjects}/file_system_spec.rb +0 -0
- data/spec/{extractor → data_subjects}/local_file_spec.rb +0 -0
- data/spec/data_subjects/none_spec.rb +41 -0
- data/spec/data_subjects/postgres_spec.rb +80 -0
- data/spec/{extractor → data_subjects}/s3_file_spec.rb +0 -0
- data/spec/data_subjects/salesforce_spec.rb +117 -0
- data/spec/{extractor → data_subjects}/sftp_file_spec.rb +16 -0
- data/spec/data_subjects/sub_job_spec.rb +33 -0
- data/spec/encoder_spec.rb +38 -0
- data/spec/extractor_spec.rb +11 -0
- data/spec/fixtures/sf_bulk_helper_stubs.rb +443 -0
- data/spec/job/transform_spec.rb +257 -0
- data/spec/job_spec.rb +507 -0
- data/spec/loader_spec.rb +11 -0
- data/spec/parser_spec.rb +38 -0
- data/spec/sf_bulk_helper_spec.rb +117 -0
- data/spec/testing/data_stub_spec.rb +5 -3
- metadata +109 -27
- data/features/aggregate.feature +0 -42
- data/jobs/aggregate_job.rb +0 -31
- data/jobs/transforms/transform_jobs.rb +0 -4
- data/lib/remi/data_subject/csv_file.rb +0 -162
- data/lib/remi/data_subject/data_frame.rb +0 -52
- data/lib/remi/data_subject/postgres.rb +0 -134
- data/lib/remi/data_subject/salesforce.rb +0 -136
- data/lib/remi/data_subject/sftp_file.rb +0 -65
- data/lib/remi/extractor/file_system.rb +0 -92
- data/lib/remi/extractor/local_file.rb +0 -43
- data/lib/remi/extractor/s3_file.rb +0 -57
- data/lib/remi/extractor/sftp_file.rb +0 -83
- data/spec/data_subject/csv_file_spec.rb +0 -79
- data/spec/data_subject/data_frame.rb +0 -27
data/jobs/sub_transform_example_job.rb
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
require_relative 'all_jobs_shared'
|
2
|
+
|
3
|
+
class SharedTransforms < Remi::Job
|
4
|
+
sub_transform :id_prefixer, prefix: 'DEFAULT' do
|
5
|
+
# Declare the required data subjects and data subject fields
|
6
|
+
source :st_source, [:st_source_id]
|
7
|
+
target :st_target, [:st_prefixed_id]
|
8
|
+
|
9
|
+
# Do anything that is allowed in a transform with the provided data subjects and fields
|
10
|
+
Remi::SourceToTargetMap.apply(st_source.df, st_target.df) do
|
11
|
+
map source(:st_source_id) .target(:st_prefixed_id)
|
12
|
+
.transform(Remi::Transform::Prefix.new(params[:prefix]))
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
16
|
+
|
17
|
+
class SubTransformExampleJob < Remi::Job
|
18
|
+
param(:job_prefix) { nil }
|
19
|
+
|
20
|
+
source :my_source do
|
21
|
+
fields({ :id => {} })
|
22
|
+
end
|
23
|
+
|
24
|
+
target :my_target do
|
25
|
+
fields(
|
26
|
+
{
|
27
|
+
:id => {},
|
28
|
+
:default_id => {}
|
29
|
+
}
|
30
|
+
)
|
31
|
+
end
|
32
|
+
|
33
|
+
transform :main do
|
34
|
+
my_target.df = my_source.df.dup
|
35
|
+
sub_trans_params = job.params[:job_prefix].nil? ? {} : { prefix: job.params[:job_prefix] }
|
36
|
+
|
37
|
+
import SharedTransforms.new.id_prefixer, sub_trans_params do
|
38
|
+
map_source_fields :my_source, :st_source, { :id => :st_source_id }
|
39
|
+
map_target_fields :st_target, :my_target, { :st_prefixed_id => :default_id }
|
40
|
+
end
|
41
|
+
|
42
|
+
end
|
43
|
+
end
|
data/jobs/sub_transform_many_to_many_job.rb
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
require_relative 'all_jobs_shared'
|
2
|
+
|
3
|
+
class SharedManyToManyTransforms < Remi::Job
|
4
|
+
sub_transform :unique_values do
|
5
|
+
source :fact, [:id]
|
6
|
+
source :dimension, [:id, :beer, :style]
|
7
|
+
target :unique_beers, [:beer, :count]
|
8
|
+
target :unique_styles, [:style, :count]
|
9
|
+
|
10
|
+
flat_df = fact.df.join(dimension.df, on: [:id], how: :left)
|
11
|
+
|
12
|
+
unique_beers.df = flat_df.group_by([:beer]).size.detach_index
|
13
|
+
unique_beers.df.rename_vectors({ :index => :beer, :values => :count })
|
14
|
+
|
15
|
+
unique_styles.df = flat_df.group_by([:style]).size.detach_index
|
16
|
+
unique_styles.df.rename_vectors({ :index => :style, :values => :count })
|
17
|
+
end
|
18
|
+
end
|
19
|
+
|
20
|
+
class SubTransformManyToManyJob < Remi::Job
|
21
|
+
source :beer_fact do
|
22
|
+
fields({ :fact_sk => {}, :beer_sk => {} })
|
23
|
+
end
|
24
|
+
|
25
|
+
source :beer_dim do
|
26
|
+
fields({ :beer_sk => {}, :name => {}, :style => {} })
|
27
|
+
end
|
28
|
+
|
29
|
+
target :beer_count do
|
30
|
+
fields({ :name => {}, :count => {} })
|
31
|
+
end
|
32
|
+
|
33
|
+
target :style_count do
|
34
|
+
fields({ :style => {}, :count => {} })
|
35
|
+
end
|
36
|
+
|
37
|
+
transform :main do
|
38
|
+
import SharedManyToManyTransforms.new.unique_values do
|
39
|
+
map_source_fields :beer_fact, :fact, { :beer_sk => :id }
|
40
|
+
map_source_fields :beer_dim, :dimension, { :beer_sk => :id, :name => :beer, :style => :style }
|
41
|
+
|
42
|
+
map_target_fields :unique_beers, :beer_count, { :beer => :name, :count => :count }
|
43
|
+
map_target_fields :unique_styles, :style_count, { :style => :style, :count => :count }
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
data/jobs/transforms/concatenate_job.rb
CHANGED
@@ -1,21 +1,25 @@
|
|
1
1
|
require_relative '../all_jobs_shared'
|
2
2
|
|
3
|
-
class ConcatenateJob
|
4
|
-
include AllJobsShared
|
3
|
+
class ConcatenateJob < Remi::Job
|
5
4
|
|
6
|
-
|
7
|
-
define_source :source_data, Remi::DataSource::DataFrame,
|
8
|
-
fields: {
|
9
|
-
:field1 => {},
|
10
|
-
:field2 => {},
|
11
|
-
:field3 => {}
|
12
|
-
}
|
13
|
-
define_target :target_data, Remi::DataTarget::DataFrame
|
5
|
+
param(:delimiter) { ',' }
|
14
6
|
|
15
|
-
|
7
|
+
source :source_data do
|
8
|
+
fields(
|
9
|
+
{
|
10
|
+
:field1 => {},
|
11
|
+
:field2 => {},
|
12
|
+
:field3 => {}
|
13
|
+
}
|
14
|
+
)
|
15
|
+
end
|
16
|
+
|
17
|
+
target :target_data
|
18
|
+
|
19
|
+
transform :main do
|
16
20
|
Remi::SourceToTargetMap.apply(source_data.df, target_data.df) do
|
17
21
|
map source(:field1, :field2, :field3) .target(:result_field)
|
18
|
-
.transform(Remi::Transform::Concatenate.new(params[:delimiter]))
|
22
|
+
.transform(Remi::Transform::Concatenate.new(job.params[:delimiter]))
|
19
23
|
end
|
20
24
|
end
|
21
25
|
end
|
data/jobs/transforms/data_frame_sieve_job.rb
CHANGED
@@ -1,31 +1,36 @@
|
|
1
1
|
require_relative '../all_jobs_shared'
|
2
2
|
|
3
|
-
class DataFrameSieveJob
|
4
|
-
include AllJobsShared
|
3
|
+
class DataFrameSieveJob < Remi::Job
|
5
4
|
|
6
|
-
|
7
|
-
fields
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
:level => {},
|
17
|
-
:program => {},
|
18
|
-
:contact => {},
|
19
|
-
:group => {}
|
20
|
-
}
|
5
|
+
source :source_data do
|
6
|
+
fields(
|
7
|
+
{
|
8
|
+
:id => {},
|
9
|
+
:level => {},
|
10
|
+
:program => {},
|
11
|
+
:contact => {}
|
12
|
+
}
|
13
|
+
)
|
14
|
+
end
|
21
15
|
|
22
|
-
|
16
|
+
source :sieve do
|
17
|
+
fields(
|
18
|
+
{
|
19
|
+
:level => {},
|
20
|
+
:program => {},
|
21
|
+
:contact => {},
|
22
|
+
:group => {}
|
23
|
+
}
|
24
|
+
)
|
25
|
+
end
|
23
26
|
|
24
|
-
|
27
|
+
target :target_data
|
25
28
|
|
29
|
+
transform :main do
|
26
30
|
# Hack to convert example to regex
|
27
31
|
sieve.df[:program].recode! { |v| (v || '').match(/\A\/.*\/\Z/) ? /#{v[1...-1]}/ : v }
|
28
32
|
|
33
|
+
target_data.df = source_data.df.dup
|
29
34
|
Remi::SourceToTargetMap.apply(source_data.df, target_data.df) do
|
30
35
|
map source(:level, :program, :contact) .target(:group)
|
31
36
|
.transform(Remi::Transform::DataFrameSieve.new(sieve.df))
|
data/jobs/transforms/date_diff_job.rb
CHANGED
@@ -1,24 +1,28 @@
|
|
1
1
|
require_relative '../all_jobs_shared'
|
2
2
|
|
3
|
-
class DateDiffJob
|
4
|
-
include AllJobsShared
|
3
|
+
class DateDiffJob < Remi::Job
|
5
4
|
|
6
|
-
|
7
|
-
define_source :source_data, Remi::DataSource::DataFrame,
|
8
|
-
fields: {
|
9
|
-
:date1 => { type: :date, format: '%Y-%m-%d' },
|
10
|
-
:date2 => { type: :date, format: '%Y-%m-%d' }
|
11
|
-
}
|
12
|
-
define_target :target_data, Remi::DataTarget::DataFrame
|
5
|
+
param(:measure) { :days }
|
13
6
|
|
14
|
-
|
7
|
+
source :source_data do
|
8
|
+
fields(
|
9
|
+
{
|
10
|
+
:date1 => { type: :date, format: '%Y-%m-%d' },
|
11
|
+
:date2 => { type: :date, format: '%Y-%m-%d' }
|
12
|
+
}
|
13
|
+
)
|
14
|
+
end
|
15
|
+
|
16
|
+
target :target_data
|
17
|
+
|
18
|
+
transform :main do
|
15
19
|
Remi::SourceToTargetMap.apply(source_data.df, target_data.df) do
|
16
20
|
map source(:date1, :date2) .target(:difference)
|
17
21
|
.transform(->(row) {
|
18
22
|
row[:date1] = Date.strptime(row[:date1])
|
19
23
|
row[:date2] = Date.strptime(row[:date2])
|
20
24
|
})
|
21
|
-
.transform(Remi::Transform::DateDiff.new(params[:measure]))
|
25
|
+
.transform(Remi::Transform::DateDiff.new(job.params[:measure]))
|
22
26
|
end
|
23
27
|
end
|
24
28
|
end
|
data/jobs/transforms/nvl_job.rb
CHANGED
@@ -1,21 +1,25 @@
|
|
1
1
|
require_relative '../all_jobs_shared'
|
2
2
|
|
3
|
-
class NvlJob
|
4
|
-
include AllJobsShared
|
3
|
+
class NvlJob < Remi::Job
|
5
4
|
|
6
|
-
|
7
|
-
define_source :source_data, Remi::DataSource::DataFrame,
|
8
|
-
fields: {
|
9
|
-
:field1 => {},
|
10
|
-
:field2 => {},
|
11
|
-
:field3 => {}
|
12
|
-
}
|
13
|
-
define_target :target_data, Remi::DataTarget::DataFrame
|
5
|
+
param(:default) { '' }
|
14
6
|
|
15
|
-
|
7
|
+
source :source_data do
|
8
|
+
fields(
|
9
|
+
{
|
10
|
+
:field1 => {},
|
11
|
+
:field2 => {},
|
12
|
+
:field3 => {}
|
13
|
+
}
|
14
|
+
)
|
15
|
+
end
|
16
|
+
|
17
|
+
target :target_data
|
18
|
+
|
19
|
+
transform :main do
|
16
20
|
Remi::SourceToTargetMap.apply(source_data.df, target_data.df) do
|
17
21
|
map source(:field1, :field2, :field3) .target(:result_field)
|
18
|
-
.transform(Remi::Transform::Nvl.new(params[:default]))
|
22
|
+
.transform(Remi::Transform::Nvl.new(job.params[:default]))
|
19
23
|
end
|
20
24
|
end
|
21
25
|
end
|
data/jobs/transforms/parse_date_job.rb
CHANGED
@@ -1,28 +1,31 @@
|
|
1
1
|
require_relative '../all_jobs_shared'
|
2
2
|
|
3
|
-
class ParseDateJob
|
4
|
-
include AllJobsShared
|
3
|
+
class ParseDateJob < Remi::Job
|
5
4
|
|
6
|
-
|
7
|
-
|
8
|
-
define_source :source_data, Remi::DataSource::DataFrame,
|
9
|
-
fields: {
|
10
|
-
:date_string => { type: :date, in_format: params[:format] },
|
11
|
-
:stubbed_date => { type: :date, in_format: params[:format] }
|
12
|
-
}
|
13
|
-
define_target :target_data, Remi::DataTarget::DataFrame
|
5
|
+
param(:format) { '%Y-%m-%d' }
|
6
|
+
param(:if_blank) { nil }
|
14
7
|
|
15
|
-
|
8
|
+
source :source_data do
|
9
|
+
fields(
|
10
|
+
{
|
11
|
+
:date_string => { type: :date, in_format: params[:format] },
|
12
|
+
:stubbed_date => { type: :date, in_format: params[:format] }
|
13
|
+
}
|
14
|
+
)
|
15
|
+
end
|
16
|
+
|
17
|
+
target :target_data
|
16
18
|
|
19
|
+
transform :main do
|
17
20
|
# Only needed for testing, would be nice to make it testable without this
|
18
|
-
params[:if_blank] = ['high', 'low'].include?(params[:if_blank]) ? params[:if_blank].to_sym : params[:if_blank]
|
21
|
+
job.params[:if_blank] = ['high', 'low'].include?(job.params[:if_blank]) ? job.params[:if_blank].to_sym : job.params[:if_blank]
|
19
22
|
|
20
23
|
Remi::SourceToTargetMap.apply(source_data.df, target_data.df) do
|
21
24
|
map source(:date_string) .target(:parsed_date)
|
22
|
-
.transform(Remi::Transform::ParseDate.new(in_format: params[:format], if_blank: params[:if_blank]))
|
25
|
+
.transform(Remi::Transform::ParseDate.new(in_format: job.params[:format], if_blank: job.params[:if_blank]))
|
23
26
|
|
24
27
|
map source(:stubbed_date) .target(:parsed_stubbed_date)
|
25
|
-
.transform(Remi::Transform::ParseDate.new(in_format: source_data.fields[:stubbed_date][:in_format], if_blank: params[:if_blank]))
|
28
|
+
.transform(Remi::Transform::ParseDate.new(in_format: source_data.fields[:stubbed_date][:in_format], if_blank: job.params[:if_blank]))
|
26
29
|
end
|
27
30
|
end
|
28
31
|
end
|
data/jobs/transforms/partitioner_job.rb
CHANGED
@@ -1,28 +1,36 @@
|
|
1
1
|
require_relative '../all_jobs_shared'
|
2
2
|
|
3
|
-
class PartitionerJob
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
3
|
+
class PartitionerJob < Remi::Job
|
4
|
+
|
5
|
+
source :source_data do
|
6
|
+
fields(
|
7
|
+
{
|
8
|
+
:id => {}
|
9
|
+
}
|
10
|
+
)
|
11
|
+
end
|
10
12
|
|
11
|
-
|
12
|
-
fields
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
source :distribution do
|
14
|
+
fields(
|
15
|
+
{
|
16
|
+
:group => {},
|
17
|
+
:weight => {}
|
18
|
+
}
|
19
|
+
)
|
20
|
+
end
|
16
21
|
|
17
|
-
|
18
|
-
fields
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
+
source :current_population do
|
23
|
+
fields(
|
24
|
+
{
|
25
|
+
:group => {},
|
26
|
+
:count => {}
|
27
|
+
}
|
28
|
+
)
|
29
|
+
end
|
22
30
|
|
23
|
-
|
31
|
+
target :target_data
|
24
32
|
|
25
|
-
|
33
|
+
transform :main do
|
26
34
|
|
27
35
|
distribution_hash = distribution.df.map(:row) { |row| [row[:group], row[:weight].to_f] }.to_h
|
28
36
|
current_population_hash = current_population.df.map(:row) { |row| [row[:group], row[:count].to_i] }.to_h
|
data/jobs/transforms/prefix_job.rb
CHANGED
@@ -1,19 +1,22 @@
|
|
1
1
|
require_relative '../all_jobs_shared'
|
2
2
|
|
3
|
-
class PrefixJob
|
4
|
-
include AllJobsShared
|
3
|
+
class PrefixJob < Remi::Job
|
5
4
|
|
6
|
-
|
7
|
-
|
8
|
-
fields
|
9
|
-
|
10
|
-
|
11
|
-
|
5
|
+
param(:prefix) { 'prefix' }
|
6
|
+
source :source_data do
|
7
|
+
fields(
|
8
|
+
{
|
9
|
+
:my_field => {}
|
10
|
+
}
|
11
|
+
)
|
12
|
+
end
|
13
|
+
|
14
|
+
target :target_data
|
12
15
|
|
13
|
-
|
16
|
+
transform :main do
|
14
17
|
Remi::SourceToTargetMap.apply(source_data.df, target_data.df) do
|
15
18
|
map source(:my_field) .target(:prefixed_field)
|
16
|
-
.transform(Remi::Transform::Prefix.new(params[:prefix]))
|
19
|
+
.transform(Remi::Transform::Prefix.new(job.params[:prefix]))
|
17
20
|
end
|
18
21
|
end
|
19
22
|
end
|
data/jobs/transforms/truncate_job.rb
CHANGED
@@ -1,19 +1,23 @@
|
|
1
1
|
require_relative '../all_jobs_shared'
|
2
2
|
|
3
|
-
class TruncateJob
|
4
|
-
include AllJobsShared
|
3
|
+
class TruncateJob < Remi::Job
|
5
4
|
|
6
|
-
|
7
|
-
define_source :source_data, Remi::DataSource::DataFrame,
|
8
|
-
fields: {
|
9
|
-
:my_field => {}
|
10
|
-
}
|
11
|
-
define_target :target_data, Remi::DataTarget::DataFrame
|
5
|
+
param(:truncate_len) { 5 }
|
12
6
|
|
13
|
-
|
7
|
+
source :source_data do
|
8
|
+
fields(
|
9
|
+
{
|
10
|
+
:my_field => {}
|
11
|
+
}
|
12
|
+
)
|
13
|
+
end
|
14
|
+
|
15
|
+
target :target_data
|
16
|
+
|
17
|
+
transform :main do
|
14
18
|
Remi::SourceToTargetMap.apply(source_data.df, target_data.df) do
|
15
19
|
map source(:my_field) .target(:truncated_field)
|
16
|
-
.transform(Remi::Transform::Truncate.new(params[:truncate_len].to_i))
|
20
|
+
.transform(Remi::Transform::Truncate.new(job.params[:truncate_len].to_i))
|
17
21
|
end
|
18
22
|
end
|
19
23
|
end
|
data/jobs/transforms/truthy_job.rb
CHANGED
@@ -1,15 +1,18 @@
|
|
1
1
|
require_relative '../all_jobs_shared'
|
2
2
|
|
3
|
-
class TruthyJob
|
4
|
-
include AllJobsShared
|
3
|
+
class TruthyJob < Remi::Job
|
5
4
|
|
6
|
-
|
7
|
-
fields
|
8
|
-
|
9
|
-
|
10
|
-
|
5
|
+
source :source_data do
|
6
|
+
fields(
|
7
|
+
{
|
8
|
+
:truthy => {}
|
9
|
+
}
|
10
|
+
)
|
11
|
+
end
|
12
|
+
|
13
|
+
target :target_data
|
11
14
|
|
12
|
-
|
15
|
+
transform :main do
|
13
16
|
Remi::SourceToTargetMap.apply(source_data.df, target_data.df) do
|
14
17
|
map source(:truthy) .target(:allow_nils)
|
15
18
|
.transform(Remi::Transform::Truthy.new(allow_nils: true))
|