remi 0.2.42 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. checksums.yaml +4 -4
  2. data/.yardopts +7 -0
  3. data/Gemfile +1 -1
  4. data/Gemfile.lock +13 -26
  5. data/README.md +1 -1
  6. data/features/step_definitions/remi_step.rb +33 -13
  7. data/features/sub_job_example.feature +24 -0
  8. data/features/sub_transform_example.feature +35 -0
  9. data/features/sub_transform_many_to_many.feature +49 -0
  10. data/features/support/env_app.rb +1 -1
  11. data/jobs/all_jobs_shared.rb +19 -16
  12. data/jobs/copy_source_job.rb +11 -9
  13. data/jobs/csv_file_target_job.rb +10 -9
  14. data/jobs/json_job.rb +18 -14
  15. data/jobs/metadata_job.rb +33 -28
  16. data/jobs/parameters_job.rb +14 -11
  17. data/jobs/sample_job.rb +106 -77
  18. data/jobs/sftp_file_target_job.rb +14 -13
  19. data/jobs/sub_job_example_job.rb +86 -0
  20. data/jobs/sub_transform_example_job.rb +43 -0
  21. data/jobs/sub_transform_many_to_many_job.rb +46 -0
  22. data/jobs/transforms/concatenate_job.rb +16 -12
  23. data/jobs/transforms/data_frame_sieve_job.rb +24 -19
  24. data/jobs/transforms/date_diff_job.rb +15 -11
  25. data/jobs/transforms/nvl_job.rb +16 -12
  26. data/jobs/transforms/parse_date_job.rb +17 -14
  27. data/jobs/transforms/partitioner_job.rb +27 -19
  28. data/jobs/transforms/prefix_job.rb +13 -10
  29. data/jobs/transforms/truncate_job.rb +14 -10
  30. data/jobs/transforms/truthy_job.rb +11 -8
  31. data/lib/remi.rb +25 -11
  32. data/lib/remi/data_frame.rb +4 -4
  33. data/lib/remi/data_frame/daru.rb +1 -37
  34. data/lib/remi/data_subject.rb +234 -48
  35. data/lib/remi/data_subjects/csv_file.rb +171 -0
  36. data/lib/remi/data_subjects/data_frame.rb +106 -0
  37. data/lib/remi/data_subjects/file_system.rb +115 -0
  38. data/lib/remi/data_subjects/local_file.rb +109 -0
  39. data/lib/remi/data_subjects/none.rb +31 -0
  40. data/lib/remi/data_subjects/postgres.rb +186 -0
  41. data/lib/remi/data_subjects/s3_file.rb +84 -0
  42. data/lib/remi/data_subjects/salesforce.rb +211 -0
  43. data/lib/remi/data_subjects/sftp_file.rb +196 -0
  44. data/lib/remi/data_subjects/sub_job.rb +50 -0
  45. data/lib/remi/dsl.rb +74 -0
  46. data/lib/remi/encoder.rb +45 -0
  47. data/lib/remi/extractor.rb +21 -0
  48. data/lib/remi/field_symbolizers.rb +1 -0
  49. data/lib/remi/job.rb +279 -113
  50. data/lib/remi/job/parameters.rb +90 -0
  51. data/lib/remi/job/sub_job.rb +35 -0
  52. data/lib/remi/job/transform.rb +165 -0
  53. data/lib/remi/loader.rb +22 -0
  54. data/lib/remi/monkeys/daru.rb +4 -0
  55. data/lib/remi/parser.rb +44 -0
  56. data/lib/remi/testing/business_rules.rb +17 -23
  57. data/lib/remi/testing/data_stub.rb +2 -2
  58. data/lib/remi/version.rb +1 -1
  59. data/remi.gemspec +3 -0
  60. data/spec/data_subject_spec.rb +475 -11
  61. data/spec/data_subjects/csv_file_spec.rb +69 -0
  62. data/spec/data_subjects/data_frame_spec.rb +52 -0
  63. data/spec/{extractor → data_subjects}/file_system_spec.rb +0 -0
  64. data/spec/{extractor → data_subjects}/local_file_spec.rb +0 -0
  65. data/spec/data_subjects/none_spec.rb +41 -0
  66. data/spec/data_subjects/postgres_spec.rb +80 -0
  67. data/spec/{extractor → data_subjects}/s3_file_spec.rb +0 -0
  68. data/spec/data_subjects/salesforce_spec.rb +117 -0
  69. data/spec/{extractor → data_subjects}/sftp_file_spec.rb +16 -0
  70. data/spec/data_subjects/sub_job_spec.rb +33 -0
  71. data/spec/encoder_spec.rb +38 -0
  72. data/spec/extractor_spec.rb +11 -0
  73. data/spec/fixtures/sf_bulk_helper_stubs.rb +443 -0
  74. data/spec/job/transform_spec.rb +257 -0
  75. data/spec/job_spec.rb +507 -0
  76. data/spec/loader_spec.rb +11 -0
  77. data/spec/parser_spec.rb +38 -0
  78. data/spec/sf_bulk_helper_spec.rb +117 -0
  79. data/spec/testing/data_stub_spec.rb +5 -3
  80. metadata +109 -27
  81. data/features/aggregate.feature +0 -42
  82. data/jobs/aggregate_job.rb +0 -31
  83. data/jobs/transforms/transform_jobs.rb +0 -4
  84. data/lib/remi/data_subject/csv_file.rb +0 -162
  85. data/lib/remi/data_subject/data_frame.rb +0 -52
  86. data/lib/remi/data_subject/postgres.rb +0 -134
  87. data/lib/remi/data_subject/salesforce.rb +0 -136
  88. data/lib/remi/data_subject/sftp_file.rb +0 -65
  89. data/lib/remi/extractor/file_system.rb +0 -92
  90. data/lib/remi/extractor/local_file.rb +0 -43
  91. data/lib/remi/extractor/s3_file.rb +0 -57
  92. data/lib/remi/extractor/sftp_file.rb +0 -83
  93. data/spec/data_subject/csv_file_spec.rb +0 -79
  94. data/spec/data_subject/data_frame.rb +0 -27
@@ -0,0 +1,43 @@
1
+ require_relative 'all_jobs_shared'
2
+
3
+ class SharedTransforms < Remi::Job
4
+ sub_transform :id_prefixer, prefix: 'DEFAULT' do
5
+ # Declare the required data subjects and data subject fields
6
+ source :st_source, [:st_source_id]
7
+ target :st_target, [:st_prefixed_id]
8
+
9
+ # Do anything that is allowed in a transform with the provided data subjects and fields
10
+ Remi::SourceToTargetMap.apply(st_source.df, st_target.df) do
11
+ map source(:st_source_id) .target(:st_prefixed_id)
12
+ .transform(Remi::Transform::Prefix.new(params[:prefix]))
13
+ end
14
+ end
15
+ end
16
+
17
+ class SubTransformExampleJob < Remi::Job
18
+ param(:job_prefix) { nil }
19
+
20
+ source :my_source do
21
+ fields({ :id => {} })
22
+ end
23
+
24
+ target :my_target do
25
+ fields(
26
+ {
27
+ :id => {},
28
+ :default_id => {}
29
+ }
30
+ )
31
+ end
32
+
33
+ transform :main do
34
+ my_target.df = my_source.df.dup
35
+ sub_trans_params = job.params[:job_prefix].nil? ? {} : { prefix: job.params[:job_prefix] }
36
+
37
+ import SharedTransforms.new.id_prefixer, sub_trans_params do
38
+ map_source_fields :my_source, :st_source, { :id => :st_source_id }
39
+ map_target_fields :st_target, :my_target, { :st_prefixed_id => :default_id }
40
+ end
41
+
42
+ end
43
+ end
@@ -0,0 +1,46 @@
1
+ require_relative 'all_jobs_shared'
2
+
3
+ class SharedManyToManyTransforms < Remi::Job
4
+ sub_transform :unique_values do
5
+ source :fact, [:id]
6
+ source :dimension, [:id, :beer, :style]
7
+ target :unique_beers, [:beer, :count]
8
+ target :unique_styles, [:style, :count]
9
+
10
+ flat_df = fact.df.join(dimension.df, on: [:id], how: :left)
11
+
12
+ unique_beers.df = flat_df.group_by([:beer]).size.detach_index
13
+ unique_beers.df.rename_vectors({ :index => :beer, :values => :count })
14
+
15
+ unique_styles.df = flat_df.group_by([:style]).size.detach_index
16
+ unique_styles.df.rename_vectors({ :index => :style, :values => :count })
17
+ end
18
+ end
19
+
20
+ class SubTransformManyToManyJob < Remi::Job
21
+ source :beer_fact do
22
+ fields({ :fact_sk => {}, :beer_sk => {} })
23
+ end
24
+
25
+ source :beer_dim do
26
+ fields({ :beer_sk => {}, :name => {}, :style => {} })
27
+ end
28
+
29
+ target :beer_count do
30
+ fields({ :name => {}, :count => {} })
31
+ end
32
+
33
+ target :style_count do
34
+ fields({ :style => {}, :count => {} })
35
+ end
36
+
37
+ transform :main do
38
+ import SharedManyToManyTransforms.new.unique_values do
39
+ map_source_fields :beer_fact, :fact, { :beer_sk => :id }
40
+ map_source_fields :beer_dim, :dimension, { :beer_sk => :id, :name => :beer, :style => :style }
41
+
42
+ map_target_fields :unique_beers, :beer_count, { :beer => :name, :count => :count }
43
+ map_target_fields :unique_styles, :style_count, { :style => :style, :count => :count }
44
+ end
45
+ end
46
+ end
@@ -1,21 +1,25 @@
1
1
  require_relative '../all_jobs_shared'
2
2
 
3
- class ConcatenateJob
4
- include AllJobsShared
3
+ class ConcatenateJob < Remi::Job
5
4
 
6
- define_param :delimiter, ','
7
- define_source :source_data, Remi::DataSource::DataFrame,
8
- fields: {
9
- :field1 => {},
10
- :field2 => {},
11
- :field3 => {}
12
- }
13
- define_target :target_data, Remi::DataTarget::DataFrame
5
+ param(:delimiter) { ',' }
14
6
 
15
- define_transform :main, sources: :source_data, targets: :target_data do
7
+ source :source_data do
8
+ fields(
9
+ {
10
+ :field1 => {},
11
+ :field2 => {},
12
+ :field3 => {}
13
+ }
14
+ )
15
+ end
16
+
17
+ target :target_data
18
+
19
+ transform :main do
16
20
  Remi::SourceToTargetMap.apply(source_data.df, target_data.df) do
17
21
  map source(:field1, :field2, :field3) .target(:result_field)
18
- .transform(Remi::Transform::Concatenate.new(params[:delimiter]))
22
+ .transform(Remi::Transform::Concatenate.new(job.params[:delimiter]))
19
23
  end
20
24
  end
21
25
  end
@@ -1,31 +1,36 @@
1
1
  require_relative '../all_jobs_shared'
2
2
 
3
- class DataFrameSieveJob
4
- include AllJobsShared
3
+ class DataFrameSieveJob < Remi::Job
5
4
 
6
- define_source :source_data, Remi::DataSource::DataFrame,
7
- fields: {
8
- :id => {},
9
- :level => {},
10
- :program => {},
11
- :contact => {}
12
- }
13
-
14
- define_source :sieve, Remi::DataSource::DataFrame,
15
- fields: {
16
- :level => {},
17
- :program => {},
18
- :contact => {},
19
- :group => {}
20
- }
5
+ source :source_data do
6
+ fields(
7
+ {
8
+ :id => {},
9
+ :level => {},
10
+ :program => {},
11
+ :contact => {}
12
+ }
13
+ )
14
+ end
21
15
 
22
- define_target :target_data, Remi::DataTarget::DataFrame
16
+ source :sieve do
17
+ fields(
18
+ {
19
+ :level => {},
20
+ :program => {},
21
+ :contact => {},
22
+ :group => {}
23
+ }
24
+ )
25
+ end
23
26
 
24
- define_transform :main, sources: :source_data, targets: :target_data do
27
+ target :target_data
25
28
 
29
+ transform :main do
26
30
  # Hack to convert example to regex
27
31
  sieve.df[:program].recode! { |v| (v || '').match(/\A\/.*\/\Z/) ? /#{v[1...-1]}/ : v }
28
32
 
33
+ target_data.df = source_data.df.dup
29
34
  Remi::SourceToTargetMap.apply(source_data.df, target_data.df) do
30
35
  map source(:level, :program, :contact) .target(:group)
31
36
  .transform(Remi::Transform::DataFrameSieve.new(sieve.df))
@@ -1,24 +1,28 @@
1
1
  require_relative '../all_jobs_shared'
2
2
 
3
- class DateDiffJob
4
- include AllJobsShared
3
+ class DateDiffJob < Remi::Job
5
4
 
6
- define_param :measure, :days
7
- define_source :source_data, Remi::DataSource::DataFrame,
8
- fields: {
9
- :date1 => { type: :date, format: '%Y-%m-%d' },
10
- :date2 => { type: :date, format: '%Y-%m-%d' }
11
- }
12
- define_target :target_data, Remi::DataTarget::DataFrame
5
+ param(:measure) { :days }
13
6
 
14
- define_transform :main, sources: :source_data, targets: :target_data do
7
+ source :source_data do
8
+ fields(
9
+ {
10
+ :date1 => { type: :date, format: '%Y-%m-%d' },
11
+ :date2 => { type: :date, format: '%Y-%m-%d' }
12
+ }
13
+ )
14
+ end
15
+
16
+ target :target_data
17
+
18
+ transform :main do
15
19
  Remi::SourceToTargetMap.apply(source_data.df, target_data.df) do
16
20
  map source(:date1, :date2) .target(:difference)
17
21
  .transform(->(row) {
18
22
  row[:date1] = Date.strptime(row[:date1])
19
23
  row[:date2] = Date.strptime(row[:date2])
20
24
  })
21
- .transform(Remi::Transform::DateDiff.new(params[:measure]))
25
+ .transform(Remi::Transform::DateDiff.new(job.params[:measure]))
22
26
  end
23
27
  end
24
28
  end
@@ -1,21 +1,25 @@
1
1
  require_relative '../all_jobs_shared'
2
2
 
3
- class NvlJob
4
- include AllJobsShared
3
+ class NvlJob < Remi::Job
5
4
 
6
- define_param :default, ''
7
- define_source :source_data, Remi::DataSource::DataFrame,
8
- fields: {
9
- :field1 => {},
10
- :field2 => {},
11
- :field3 => {}
12
- }
13
- define_target :target_data, Remi::DataTarget::DataFrame
5
+ param(:default) { '' }
14
6
 
15
- define_transform :main, sources: :source_data, targets: :target_data do
7
+ source :source_data do
8
+ fields(
9
+ {
10
+ :field1 => {},
11
+ :field2 => {},
12
+ :field3 => {}
13
+ }
14
+ )
15
+ end
16
+
17
+ target :target_data
18
+
19
+ transform :main do
16
20
  Remi::SourceToTargetMap.apply(source_data.df, target_data.df) do
17
21
  map source(:field1, :field2, :field3) .target(:result_field)
18
- .transform(Remi::Transform::Nvl.new(params[:default]))
22
+ .transform(Remi::Transform::Nvl.new(job.params[:default]))
19
23
  end
20
24
  end
21
25
  end
@@ -1,28 +1,31 @@
1
1
  require_relative '../all_jobs_shared'
2
2
 
3
- class ParseDateJob
4
- include AllJobsShared
3
+ class ParseDateJob < Remi::Job
5
4
 
6
- define_param :format, '%Y-%m-%d'
7
- define_param :if_blank, nil
8
- define_source :source_data, Remi::DataSource::DataFrame,
9
- fields: {
10
- :date_string => { type: :date, in_format: params[:format] },
11
- :stubbed_date => { type: :date, in_format: params[:format] }
12
- }
13
- define_target :target_data, Remi::DataTarget::DataFrame
5
+ param(:format) { '%Y-%m-%d' }
6
+ param(:if_blank) { nil }
14
7
 
15
- define_transform :main, sources: :source_data, targets: :target_data do
8
+ source :source_data do
9
+ fields(
10
+ {
11
+ :date_string => { type: :date, in_format: params[:format] },
12
+ :stubbed_date => { type: :date, in_format: params[:format] }
13
+ }
14
+ )
15
+ end
16
+
17
+ target :target_data
16
18
 
19
+ transform :main do
17
20
  # Only needed for testing, would be nice to make it testable without this
18
- params[:if_blank] = ['high', 'low'].include?(params[:if_blank]) ? params[:if_blank].to_sym : params[:if_blank]
21
+ job.params[:if_blank] = ['high', 'low'].include?(job.params[:if_blank]) ? job.params[:if_blank].to_sym : job.params[:if_blank]
19
22
 
20
23
  Remi::SourceToTargetMap.apply(source_data.df, target_data.df) do
21
24
  map source(:date_string) .target(:parsed_date)
22
- .transform(Remi::Transform::ParseDate.new(in_format: params[:format], if_blank: params[:if_blank]))
25
+ .transform(Remi::Transform::ParseDate.new(in_format: job.params[:format], if_blank: job.params[:if_blank]))
23
26
 
24
27
  map source(:stubbed_date) .target(:parsed_stubbed_date)
25
- .transform(Remi::Transform::ParseDate.new(in_format: source_data.fields[:stubbed_date][:in_format], if_blank: params[:if_blank]))
28
+ .transform(Remi::Transform::ParseDate.new(in_format: source_data.fields[:stubbed_date][:in_format], if_blank: job.params[:if_blank]))
26
29
  end
27
30
  end
28
31
  end
@@ -1,28 +1,36 @@
1
1
  require_relative '../all_jobs_shared'
2
2
 
3
- class PartitionerJob
4
- include AllJobsShared
5
-
6
- define_source :source_data, Remi::DataSource::DataFrame,
7
- fields: {
8
- :id => {}
9
- }
3
+ class PartitionerJob < Remi::Job
4
+
5
+ source :source_data do
6
+ fields(
7
+ {
8
+ :id => {}
9
+ }
10
+ )
11
+ end
10
12
 
11
- define_source :distribution, Remi::DataSource::DataFrame,
12
- fields: {
13
- :group => {},
14
- :weight => {}
15
- }
13
+ source :distribution do
14
+ fields(
15
+ {
16
+ :group => {},
17
+ :weight => {}
18
+ }
19
+ )
20
+ end
16
21
 
17
- define_source :current_population, Remi::DataSource::DataFrame,
18
- fields: {
19
- :group => {},
20
- :count => {}
21
- }
22
+ source :current_population do
23
+ fields(
24
+ {
25
+ :group => {},
26
+ :count => {}
27
+ }
28
+ )
29
+ end
22
30
 
23
- define_target :target_data, Remi::DataTarget::DataFrame
31
+ target :target_data
24
32
 
25
- define_transform :main, sources: :source_data, targets: :target_data do
33
+ transform :main do
26
34
 
27
35
  distribution_hash = distribution.df.map(:row) { |row| [row[:group], row[:weight].to_f] }.to_h
28
36
  current_population_hash = current_population.df.map(:row) { |row| [row[:group], row[:count].to_i] }.to_h
@@ -1,19 +1,22 @@
1
1
  require_relative '../all_jobs_shared'
2
2
 
3
- class PrefixJob
4
- include AllJobsShared
3
+ class PrefixJob < Remi::Job
5
4
 
6
- define_param :prefix, 'prefix'
7
- define_source :source_data, Remi::DataSource::DataFrame,
8
- fields: {
9
- :my_field => {}
10
- }
11
- define_target :target_data, Remi::DataTarget::DataFrame
5
+ param(:prefix) { 'prefix' }
6
+ source :source_data do
7
+ fields(
8
+ {
9
+ :my_field => {}
10
+ }
11
+ )
12
+ end
13
+
14
+ target :target_data
12
15
 
13
- define_transform :main, sources: :source_data, targets: :target_data do
16
+ transform :main do
14
17
  Remi::SourceToTargetMap.apply(source_data.df, target_data.df) do
15
18
  map source(:my_field) .target(:prefixed_field)
16
- .transform(Remi::Transform::Prefix.new(params[:prefix]))
19
+ .transform(Remi::Transform::Prefix.new(job.params[:prefix]))
17
20
  end
18
21
  end
19
22
  end
@@ -1,19 +1,23 @@
1
1
  require_relative '../all_jobs_shared'
2
2
 
3
- class TruncateJob
4
- include AllJobsShared
3
+ class TruncateJob < Remi::Job
5
4
 
6
- define_param :truncate_len, 5
7
- define_source :source_data, Remi::DataSource::DataFrame,
8
- fields: {
9
- :my_field => {}
10
- }
11
- define_target :target_data, Remi::DataTarget::DataFrame
5
+ param(:truncate_len) { 5 }
12
6
 
13
- define_transform :main, sources: :source_data, targets: :target_data do
7
+ source :source_data do
8
+ fields(
9
+ {
10
+ :my_field => {}
11
+ }
12
+ )
13
+ end
14
+
15
+ target :target_data
16
+
17
+ transform :main do
14
18
  Remi::SourceToTargetMap.apply(source_data.df, target_data.df) do
15
19
  map source(:my_field) .target(:truncated_field)
16
- .transform(Remi::Transform::Truncate.new(params[:truncate_len].to_i))
20
+ .transform(Remi::Transform::Truncate.new(job.params[:truncate_len].to_i))
17
21
  end
18
22
  end
19
23
  end
@@ -1,15 +1,18 @@
1
1
  require_relative '../all_jobs_shared'
2
2
 
3
- class TruthyJob
4
- include AllJobsShared
3
+ class TruthyJob < Remi::Job
5
4
 
6
- define_source :source_data, Remi::DataSource::DataFrame,
7
- fields: {
8
- :truthy => {}
9
- }
10
- define_target :target_data, Remi::DataTarget::DataFrame
5
+ source :source_data do
6
+ fields(
7
+ {
8
+ :truthy => {}
9
+ }
10
+ )
11
+ end
12
+
13
+ target :target_data
11
14
 
12
- define_transform :main, sources: :source_data, targets: :target_data do
15
+ transform :main do
13
16
  Remi::SourceToTargetMap.apply(source_data.df, target_data.df) do
14
17
  map source(:truthy) .target(:allow_nils)
15
18
  .transform(Remi::Transform::Truthy.new(allow_nils: true))