remi 0.2.42 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (94) hide show
  1. checksums.yaml +4 -4
  2. data/.yardopts +7 -0
  3. data/Gemfile +1 -1
  4. data/Gemfile.lock +13 -26
  5. data/README.md +1 -1
  6. data/features/step_definitions/remi_step.rb +33 -13
  7. data/features/sub_job_example.feature +24 -0
  8. data/features/sub_transform_example.feature +35 -0
  9. data/features/sub_transform_many_to_many.feature +49 -0
  10. data/features/support/env_app.rb +1 -1
  11. data/jobs/all_jobs_shared.rb +19 -16
  12. data/jobs/copy_source_job.rb +11 -9
  13. data/jobs/csv_file_target_job.rb +10 -9
  14. data/jobs/json_job.rb +18 -14
  15. data/jobs/metadata_job.rb +33 -28
  16. data/jobs/parameters_job.rb +14 -11
  17. data/jobs/sample_job.rb +106 -77
  18. data/jobs/sftp_file_target_job.rb +14 -13
  19. data/jobs/sub_job_example_job.rb +86 -0
  20. data/jobs/sub_transform_example_job.rb +43 -0
  21. data/jobs/sub_transform_many_to_many_job.rb +46 -0
  22. data/jobs/transforms/concatenate_job.rb +16 -12
  23. data/jobs/transforms/data_frame_sieve_job.rb +24 -19
  24. data/jobs/transforms/date_diff_job.rb +15 -11
  25. data/jobs/transforms/nvl_job.rb +16 -12
  26. data/jobs/transforms/parse_date_job.rb +17 -14
  27. data/jobs/transforms/partitioner_job.rb +27 -19
  28. data/jobs/transforms/prefix_job.rb +13 -10
  29. data/jobs/transforms/truncate_job.rb +14 -10
  30. data/jobs/transforms/truthy_job.rb +11 -8
  31. data/lib/remi.rb +25 -11
  32. data/lib/remi/data_frame.rb +4 -4
  33. data/lib/remi/data_frame/daru.rb +1 -37
  34. data/lib/remi/data_subject.rb +234 -48
  35. data/lib/remi/data_subjects/csv_file.rb +171 -0
  36. data/lib/remi/data_subjects/data_frame.rb +106 -0
  37. data/lib/remi/data_subjects/file_system.rb +115 -0
  38. data/lib/remi/data_subjects/local_file.rb +109 -0
  39. data/lib/remi/data_subjects/none.rb +31 -0
  40. data/lib/remi/data_subjects/postgres.rb +186 -0
  41. data/lib/remi/data_subjects/s3_file.rb +84 -0
  42. data/lib/remi/data_subjects/salesforce.rb +211 -0
  43. data/lib/remi/data_subjects/sftp_file.rb +196 -0
  44. data/lib/remi/data_subjects/sub_job.rb +50 -0
  45. data/lib/remi/dsl.rb +74 -0
  46. data/lib/remi/encoder.rb +45 -0
  47. data/lib/remi/extractor.rb +21 -0
  48. data/lib/remi/field_symbolizers.rb +1 -0
  49. data/lib/remi/job.rb +279 -113
  50. data/lib/remi/job/parameters.rb +90 -0
  51. data/lib/remi/job/sub_job.rb +35 -0
  52. data/lib/remi/job/transform.rb +165 -0
  53. data/lib/remi/loader.rb +22 -0
  54. data/lib/remi/monkeys/daru.rb +4 -0
  55. data/lib/remi/parser.rb +44 -0
  56. data/lib/remi/testing/business_rules.rb +17 -23
  57. data/lib/remi/testing/data_stub.rb +2 -2
  58. data/lib/remi/version.rb +1 -1
  59. data/remi.gemspec +3 -0
  60. data/spec/data_subject_spec.rb +475 -11
  61. data/spec/data_subjects/csv_file_spec.rb +69 -0
  62. data/spec/data_subjects/data_frame_spec.rb +52 -0
  63. data/spec/{extractor → data_subjects}/file_system_spec.rb +0 -0
  64. data/spec/{extractor → data_subjects}/local_file_spec.rb +0 -0
  65. data/spec/data_subjects/none_spec.rb +41 -0
  66. data/spec/data_subjects/postgres_spec.rb +80 -0
  67. data/spec/{extractor → data_subjects}/s3_file_spec.rb +0 -0
  68. data/spec/data_subjects/salesforce_spec.rb +117 -0
  69. data/spec/{extractor → data_subjects}/sftp_file_spec.rb +16 -0
  70. data/spec/data_subjects/sub_job_spec.rb +33 -0
  71. data/spec/encoder_spec.rb +38 -0
  72. data/spec/extractor_spec.rb +11 -0
  73. data/spec/fixtures/sf_bulk_helper_stubs.rb +443 -0
  74. data/spec/job/transform_spec.rb +257 -0
  75. data/spec/job_spec.rb +507 -0
  76. data/spec/loader_spec.rb +11 -0
  77. data/spec/parser_spec.rb +38 -0
  78. data/spec/sf_bulk_helper_spec.rb +117 -0
  79. data/spec/testing/data_stub_spec.rb +5 -3
  80. metadata +109 -27
  81. data/features/aggregate.feature +0 -42
  82. data/jobs/aggregate_job.rb +0 -31
  83. data/jobs/transforms/transform_jobs.rb +0 -4
  84. data/lib/remi/data_subject/csv_file.rb +0 -162
  85. data/lib/remi/data_subject/data_frame.rb +0 -52
  86. data/lib/remi/data_subject/postgres.rb +0 -134
  87. data/lib/remi/data_subject/salesforce.rb +0 -136
  88. data/lib/remi/data_subject/sftp_file.rb +0 -65
  89. data/lib/remi/extractor/file_system.rb +0 -92
  90. data/lib/remi/extractor/local_file.rb +0 -43
  91. data/lib/remi/extractor/s3_file.rb +0 -57
  92. data/lib/remi/extractor/sftp_file.rb +0 -83
  93. data/spec/data_subject/csv_file_spec.rb +0 -79
  94. data/spec/data_subject/data_frame.rb +0 -27
@@ -0,0 +1,43 @@
1
+ require_relative 'all_jobs_shared'
2
+
3
+ class SharedTransforms < Remi::Job
4
+ sub_transform :id_prefixer, prefix: 'DEFAULT' do
5
+ # Declare the required data subjects and data subject fields
6
+ source :st_source, [:st_source_id]
7
+ target :st_target, [:st_prefixed_id]
8
+
9
+ # Do anything that is allowed in a transform with the provided data subjects and fields
10
+ Remi::SourceToTargetMap.apply(st_source.df, st_target.df) do
11
+ map source(:st_source_id) .target(:st_prefixed_id)
12
+ .transform(Remi::Transform::Prefix.new(params[:prefix]))
13
+ end
14
+ end
15
+ end
16
+
17
+ class SubTransformExampleJob < Remi::Job
18
+ param(:job_prefix) { nil }
19
+
20
+ source :my_source do
21
+ fields({ :id => {} })
22
+ end
23
+
24
+ target :my_target do
25
+ fields(
26
+ {
27
+ :id => {},
28
+ :default_id => {}
29
+ }
30
+ )
31
+ end
32
+
33
+ transform :main do
34
+ my_target.df = my_source.df.dup
35
+ sub_trans_params = job.params[:job_prefix].nil? ? {} : { prefix: job.params[:job_prefix] }
36
+
37
+ import SharedTransforms.new.id_prefixer, sub_trans_params do
38
+ map_source_fields :my_source, :st_source, { :id => :st_source_id }
39
+ map_target_fields :st_target, :my_target, { :st_prefixed_id => :default_id }
40
+ end
41
+
42
+ end
43
+ end
@@ -0,0 +1,46 @@
1
+ require_relative 'all_jobs_shared'
2
+
3
+ class SharedManyToManyTransforms < Remi::Job
4
+ sub_transform :unique_values do
5
+ source :fact, [:id]
6
+ source :dimension, [:id, :beer, :style]
7
+ target :unique_beers, [:beer, :count]
8
+ target :unique_styles, [:style, :count]
9
+
10
+ flat_df = fact.df.join(dimension.df, on: [:id], how: :left)
11
+
12
+ unique_beers.df = flat_df.group_by([:beer]).size.detach_index
13
+ unique_beers.df.rename_vectors({ :index => :beer, :values => :count })
14
+
15
+ unique_styles.df = flat_df.group_by([:style]).size.detach_index
16
+ unique_styles.df.rename_vectors({ :index => :style, :values => :count })
17
+ end
18
+ end
19
+
20
+ class SubTransformManyToManyJob < Remi::Job
21
+ source :beer_fact do
22
+ fields({ :fact_sk => {}, :beer_sk => {} })
23
+ end
24
+
25
+ source :beer_dim do
26
+ fields({ :beer_sk => {}, :name => {}, :style => {} })
27
+ end
28
+
29
+ target :beer_count do
30
+ fields({ :name => {}, :count => {} })
31
+ end
32
+
33
+ target :style_count do
34
+ fields({ :style => {}, :count => {} })
35
+ end
36
+
37
+ transform :main do
38
+ import SharedManyToManyTransforms.new.unique_values do
39
+ map_source_fields :beer_fact, :fact, { :beer_sk => :id }
40
+ map_source_fields :beer_dim, :dimension, { :beer_sk => :id, :name => :beer, :style => :style }
41
+
42
+ map_target_fields :unique_beers, :beer_count, { :beer => :name, :count => :count }
43
+ map_target_fields :unique_styles, :style_count, { :style => :style, :count => :count }
44
+ end
45
+ end
46
+ end
@@ -1,21 +1,25 @@
1
1
  require_relative '../all_jobs_shared'
2
2
 
3
- class ConcatenateJob
4
- include AllJobsShared
3
+ class ConcatenateJob < Remi::Job
5
4
 
6
- define_param :delimiter, ','
7
- define_source :source_data, Remi::DataSource::DataFrame,
8
- fields: {
9
- :field1 => {},
10
- :field2 => {},
11
- :field3 => {}
12
- }
13
- define_target :target_data, Remi::DataTarget::DataFrame
5
+ param(:delimiter) { ',' }
14
6
 
15
- define_transform :main, sources: :source_data, targets: :target_data do
7
+ source :source_data do
8
+ fields(
9
+ {
10
+ :field1 => {},
11
+ :field2 => {},
12
+ :field3 => {}
13
+ }
14
+ )
15
+ end
16
+
17
+ target :target_data
18
+
19
+ transform :main do
16
20
  Remi::SourceToTargetMap.apply(source_data.df, target_data.df) do
17
21
  map source(:field1, :field2, :field3) .target(:result_field)
18
- .transform(Remi::Transform::Concatenate.new(params[:delimiter]))
22
+ .transform(Remi::Transform::Concatenate.new(job.params[:delimiter]))
19
23
  end
20
24
  end
21
25
  end
@@ -1,31 +1,36 @@
1
1
  require_relative '../all_jobs_shared'
2
2
 
3
- class DataFrameSieveJob
4
- include AllJobsShared
3
+ class DataFrameSieveJob < Remi::Job
5
4
 
6
- define_source :source_data, Remi::DataSource::DataFrame,
7
- fields: {
8
- :id => {},
9
- :level => {},
10
- :program => {},
11
- :contact => {}
12
- }
13
-
14
- define_source :sieve, Remi::DataSource::DataFrame,
15
- fields: {
16
- :level => {},
17
- :program => {},
18
- :contact => {},
19
- :group => {}
20
- }
5
+ source :source_data do
6
+ fields(
7
+ {
8
+ :id => {},
9
+ :level => {},
10
+ :program => {},
11
+ :contact => {}
12
+ }
13
+ )
14
+ end
21
15
 
22
- define_target :target_data, Remi::DataTarget::DataFrame
16
+ source :sieve do
17
+ fields(
18
+ {
19
+ :level => {},
20
+ :program => {},
21
+ :contact => {},
22
+ :group => {}
23
+ }
24
+ )
25
+ end
23
26
 
24
- define_transform :main, sources: :source_data, targets: :target_data do
27
+ target :target_data
25
28
 
29
+ transform :main do
26
30
  # Hack to convert example to regex
27
31
  sieve.df[:program].recode! { |v| (v || '').match(/\A\/.*\/\Z/) ? /#{v[1...-1]}/ : v }
28
32
 
33
+ target_data.df = source_data.df.dup
29
34
  Remi::SourceToTargetMap.apply(source_data.df, target_data.df) do
30
35
  map source(:level, :program, :contact) .target(:group)
31
36
  .transform(Remi::Transform::DataFrameSieve.new(sieve.df))
@@ -1,24 +1,28 @@
1
1
  require_relative '../all_jobs_shared'
2
2
 
3
- class DateDiffJob
4
- include AllJobsShared
3
+ class DateDiffJob < Remi::Job
5
4
 
6
- define_param :measure, :days
7
- define_source :source_data, Remi::DataSource::DataFrame,
8
- fields: {
9
- :date1 => { type: :date, format: '%Y-%m-%d' },
10
- :date2 => { type: :date, format: '%Y-%m-%d' }
11
- }
12
- define_target :target_data, Remi::DataTarget::DataFrame
5
+ param(:measure) { :days }
13
6
 
14
- define_transform :main, sources: :source_data, targets: :target_data do
7
+ source :source_data do
8
+ fields(
9
+ {
10
+ :date1 => { type: :date, format: '%Y-%m-%d' },
11
+ :date2 => { type: :date, format: '%Y-%m-%d' }
12
+ }
13
+ )
14
+ end
15
+
16
+ target :target_data
17
+
18
+ transform :main do
15
19
  Remi::SourceToTargetMap.apply(source_data.df, target_data.df) do
16
20
  map source(:date1, :date2) .target(:difference)
17
21
  .transform(->(row) {
18
22
  row[:date1] = Date.strptime(row[:date1])
19
23
  row[:date2] = Date.strptime(row[:date2])
20
24
  })
21
- .transform(Remi::Transform::DateDiff.new(params[:measure]))
25
+ .transform(Remi::Transform::DateDiff.new(job.params[:measure]))
22
26
  end
23
27
  end
24
28
  end
@@ -1,21 +1,25 @@
1
1
  require_relative '../all_jobs_shared'
2
2
 
3
- class NvlJob
4
- include AllJobsShared
3
+ class NvlJob < Remi::Job
5
4
 
6
- define_param :default, ''
7
- define_source :source_data, Remi::DataSource::DataFrame,
8
- fields: {
9
- :field1 => {},
10
- :field2 => {},
11
- :field3 => {}
12
- }
13
- define_target :target_data, Remi::DataTarget::DataFrame
5
+ param(:default) { '' }
14
6
 
15
- define_transform :main, sources: :source_data, targets: :target_data do
7
+ source :source_data do
8
+ fields(
9
+ {
10
+ :field1 => {},
11
+ :field2 => {},
12
+ :field3 => {}
13
+ }
14
+ )
15
+ end
16
+
17
+ target :target_data
18
+
19
+ transform :main do
16
20
  Remi::SourceToTargetMap.apply(source_data.df, target_data.df) do
17
21
  map source(:field1, :field2, :field3) .target(:result_field)
18
- .transform(Remi::Transform::Nvl.new(params[:default]))
22
+ .transform(Remi::Transform::Nvl.new(job.params[:default]))
19
23
  end
20
24
  end
21
25
  end
@@ -1,28 +1,31 @@
1
1
  require_relative '../all_jobs_shared'
2
2
 
3
- class ParseDateJob
4
- include AllJobsShared
3
+ class ParseDateJob < Remi::Job
5
4
 
6
- define_param :format, '%Y-%m-%d'
7
- define_param :if_blank, nil
8
- define_source :source_data, Remi::DataSource::DataFrame,
9
- fields: {
10
- :date_string => { type: :date, in_format: params[:format] },
11
- :stubbed_date => { type: :date, in_format: params[:format] }
12
- }
13
- define_target :target_data, Remi::DataTarget::DataFrame
5
+ param(:format) { '%Y-%m-%d' }
6
+ param(:if_blank) { nil }
14
7
 
15
- define_transform :main, sources: :source_data, targets: :target_data do
8
+ source :source_data do
9
+ fields(
10
+ {
11
+ :date_string => { type: :date, in_format: params[:format] },
12
+ :stubbed_date => { type: :date, in_format: params[:format] }
13
+ }
14
+ )
15
+ end
16
+
17
+ target :target_data
16
18
 
19
+ transform :main do
17
20
  # Only needed for testing, would be nice to make it testable without this
18
- params[:if_blank] = ['high', 'low'].include?(params[:if_blank]) ? params[:if_blank].to_sym : params[:if_blank]
21
+ job.params[:if_blank] = ['high', 'low'].include?(job.params[:if_blank]) ? job.params[:if_blank].to_sym : job.params[:if_blank]
19
22
 
20
23
  Remi::SourceToTargetMap.apply(source_data.df, target_data.df) do
21
24
  map source(:date_string) .target(:parsed_date)
22
- .transform(Remi::Transform::ParseDate.new(in_format: params[:format], if_blank: params[:if_blank]))
25
+ .transform(Remi::Transform::ParseDate.new(in_format: job.params[:format], if_blank: job.params[:if_blank]))
23
26
 
24
27
  map source(:stubbed_date) .target(:parsed_stubbed_date)
25
- .transform(Remi::Transform::ParseDate.new(in_format: source_data.fields[:stubbed_date][:in_format], if_blank: params[:if_blank]))
28
+ .transform(Remi::Transform::ParseDate.new(in_format: source_data.fields[:stubbed_date][:in_format], if_blank: job.params[:if_blank]))
26
29
  end
27
30
  end
28
31
  end
@@ -1,28 +1,36 @@
1
1
  require_relative '../all_jobs_shared'
2
2
 
3
- class PartitionerJob
4
- include AllJobsShared
5
-
6
- define_source :source_data, Remi::DataSource::DataFrame,
7
- fields: {
8
- :id => {}
9
- }
3
+ class PartitionerJob < Remi::Job
4
+
5
+ source :source_data do
6
+ fields(
7
+ {
8
+ :id => {}
9
+ }
10
+ )
11
+ end
10
12
 
11
- define_source :distribution, Remi::DataSource::DataFrame,
12
- fields: {
13
- :group => {},
14
- :weight => {}
15
- }
13
+ source :distribution do
14
+ fields(
15
+ {
16
+ :group => {},
17
+ :weight => {}
18
+ }
19
+ )
20
+ end
16
21
 
17
- define_source :current_population, Remi::DataSource::DataFrame,
18
- fields: {
19
- :group => {},
20
- :count => {}
21
- }
22
+ source :current_population do
23
+ fields(
24
+ {
25
+ :group => {},
26
+ :count => {}
27
+ }
28
+ )
29
+ end
22
30
 
23
- define_target :target_data, Remi::DataTarget::DataFrame
31
+ target :target_data
24
32
 
25
- define_transform :main, sources: :source_data, targets: :target_data do
33
+ transform :main do
26
34
 
27
35
  distribution_hash = distribution.df.map(:row) { |row| [row[:group], row[:weight].to_f] }.to_h
28
36
  current_population_hash = current_population.df.map(:row) { |row| [row[:group], row[:count].to_i] }.to_h
@@ -1,19 +1,22 @@
1
1
  require_relative '../all_jobs_shared'
2
2
 
3
- class PrefixJob
4
- include AllJobsShared
3
+ class PrefixJob < Remi::Job
5
4
 
6
- define_param :prefix, 'prefix'
7
- define_source :source_data, Remi::DataSource::DataFrame,
8
- fields: {
9
- :my_field => {}
10
- }
11
- define_target :target_data, Remi::DataTarget::DataFrame
5
+ param(:prefix) { 'prefix' }
6
+ source :source_data do
7
+ fields(
8
+ {
9
+ :my_field => {}
10
+ }
11
+ )
12
+ end
13
+
14
+ target :target_data
12
15
 
13
- define_transform :main, sources: :source_data, targets: :target_data do
16
+ transform :main do
14
17
  Remi::SourceToTargetMap.apply(source_data.df, target_data.df) do
15
18
  map source(:my_field) .target(:prefixed_field)
16
- .transform(Remi::Transform::Prefix.new(params[:prefix]))
19
+ .transform(Remi::Transform::Prefix.new(job.params[:prefix]))
17
20
  end
18
21
  end
19
22
  end
@@ -1,19 +1,23 @@
1
1
  require_relative '../all_jobs_shared'
2
2
 
3
- class TruncateJob
4
- include AllJobsShared
3
+ class TruncateJob < Remi::Job
5
4
 
6
- define_param :truncate_len, 5
7
- define_source :source_data, Remi::DataSource::DataFrame,
8
- fields: {
9
- :my_field => {}
10
- }
11
- define_target :target_data, Remi::DataTarget::DataFrame
5
+ param(:truncate_len) { 5 }
12
6
 
13
- define_transform :main, sources: :source_data, targets: :target_data do
7
+ source :source_data do
8
+ fields(
9
+ {
10
+ :my_field => {}
11
+ }
12
+ )
13
+ end
14
+
15
+ target :target_data
16
+
17
+ transform :main do
14
18
  Remi::SourceToTargetMap.apply(source_data.df, target_data.df) do
15
19
  map source(:my_field) .target(:truncated_field)
16
- .transform(Remi::Transform::Truncate.new(params[:truncate_len].to_i))
20
+ .transform(Remi::Transform::Truncate.new(job.params[:truncate_len].to_i))
17
21
  end
18
22
  end
19
23
  end
@@ -1,15 +1,18 @@
1
1
  require_relative '../all_jobs_shared'
2
2
 
3
- class TruthyJob
4
- include AllJobsShared
3
+ class TruthyJob < Remi::Job
5
4
 
6
- define_source :source_data, Remi::DataSource::DataFrame,
7
- fields: {
8
- :truthy => {}
9
- }
10
- define_target :target_data, Remi::DataTarget::DataFrame
5
+ source :source_data do
6
+ fields(
7
+ {
8
+ :truthy => {}
9
+ }
10
+ )
11
+ end
12
+
13
+ target :target_data
11
14
 
12
- define_transform :main, sources: :source_data, targets: :target_data do
15
+ transform :main do
13
16
  Remi::SourceToTargetMap.apply(source_data.df, target_data.df) do
14
17
  map source(:truthy) .target(:allow_nils)
15
18
  .transform(Remi::Transform::Truthy.new(allow_nils: true))