remi 0.2.42 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. checksums.yaml +4 -4
  2. data/.yardopts +7 -0
  3. data/Gemfile +1 -1
  4. data/Gemfile.lock +13 -26
  5. data/README.md +1 -1
  6. data/features/step_definitions/remi_step.rb +33 -13
  7. data/features/sub_job_example.feature +24 -0
  8. data/features/sub_transform_example.feature +35 -0
  9. data/features/sub_transform_many_to_many.feature +49 -0
  10. data/features/support/env_app.rb +1 -1
  11. data/jobs/all_jobs_shared.rb +19 -16
  12. data/jobs/copy_source_job.rb +11 -9
  13. data/jobs/csv_file_target_job.rb +10 -9
  14. data/jobs/json_job.rb +18 -14
  15. data/jobs/metadata_job.rb +33 -28
  16. data/jobs/parameters_job.rb +14 -11
  17. data/jobs/sample_job.rb +106 -77
  18. data/jobs/sftp_file_target_job.rb +14 -13
  19. data/jobs/sub_job_example_job.rb +86 -0
  20. data/jobs/sub_transform_example_job.rb +43 -0
  21. data/jobs/sub_transform_many_to_many_job.rb +46 -0
  22. data/jobs/transforms/concatenate_job.rb +16 -12
  23. data/jobs/transforms/data_frame_sieve_job.rb +24 -19
  24. data/jobs/transforms/date_diff_job.rb +15 -11
  25. data/jobs/transforms/nvl_job.rb +16 -12
  26. data/jobs/transforms/parse_date_job.rb +17 -14
  27. data/jobs/transforms/partitioner_job.rb +27 -19
  28. data/jobs/transforms/prefix_job.rb +13 -10
  29. data/jobs/transforms/truncate_job.rb +14 -10
  30. data/jobs/transforms/truthy_job.rb +11 -8
  31. data/lib/remi.rb +25 -11
  32. data/lib/remi/data_frame.rb +4 -4
  33. data/lib/remi/data_frame/daru.rb +1 -37
  34. data/lib/remi/data_subject.rb +234 -48
  35. data/lib/remi/data_subjects/csv_file.rb +171 -0
  36. data/lib/remi/data_subjects/data_frame.rb +106 -0
  37. data/lib/remi/data_subjects/file_system.rb +115 -0
  38. data/lib/remi/data_subjects/local_file.rb +109 -0
  39. data/lib/remi/data_subjects/none.rb +31 -0
  40. data/lib/remi/data_subjects/postgres.rb +186 -0
  41. data/lib/remi/data_subjects/s3_file.rb +84 -0
  42. data/lib/remi/data_subjects/salesforce.rb +211 -0
  43. data/lib/remi/data_subjects/sftp_file.rb +196 -0
  44. data/lib/remi/data_subjects/sub_job.rb +50 -0
  45. data/lib/remi/dsl.rb +74 -0
  46. data/lib/remi/encoder.rb +45 -0
  47. data/lib/remi/extractor.rb +21 -0
  48. data/lib/remi/field_symbolizers.rb +1 -0
  49. data/lib/remi/job.rb +279 -113
  50. data/lib/remi/job/parameters.rb +90 -0
  51. data/lib/remi/job/sub_job.rb +35 -0
  52. data/lib/remi/job/transform.rb +165 -0
  53. data/lib/remi/loader.rb +22 -0
  54. data/lib/remi/monkeys/daru.rb +4 -0
  55. data/lib/remi/parser.rb +44 -0
  56. data/lib/remi/testing/business_rules.rb +17 -23
  57. data/lib/remi/testing/data_stub.rb +2 -2
  58. data/lib/remi/version.rb +1 -1
  59. data/remi.gemspec +3 -0
  60. data/spec/data_subject_spec.rb +475 -11
  61. data/spec/data_subjects/csv_file_spec.rb +69 -0
  62. data/spec/data_subjects/data_frame_spec.rb +52 -0
  63. data/spec/{extractor → data_subjects}/file_system_spec.rb +0 -0
  64. data/spec/{extractor → data_subjects}/local_file_spec.rb +0 -0
  65. data/spec/data_subjects/none_spec.rb +41 -0
  66. data/spec/data_subjects/postgres_spec.rb +80 -0
  67. data/spec/{extractor → data_subjects}/s3_file_spec.rb +0 -0
  68. data/spec/data_subjects/salesforce_spec.rb +117 -0
  69. data/spec/{extractor → data_subjects}/sftp_file_spec.rb +16 -0
  70. data/spec/data_subjects/sub_job_spec.rb +33 -0
  71. data/spec/encoder_spec.rb +38 -0
  72. data/spec/extractor_spec.rb +11 -0
  73. data/spec/fixtures/sf_bulk_helper_stubs.rb +443 -0
  74. data/spec/job/transform_spec.rb +257 -0
  75. data/spec/job_spec.rb +507 -0
  76. data/spec/loader_spec.rb +11 -0
  77. data/spec/parser_spec.rb +38 -0
  78. data/spec/sf_bulk_helper_spec.rb +117 -0
  79. data/spec/testing/data_stub_spec.rb +5 -3
  80. metadata +109 -27
  81. data/features/aggregate.feature +0 -42
  82. data/jobs/aggregate_job.rb +0 -31
  83. data/jobs/transforms/transform_jobs.rb +0 -4
  84. data/lib/remi/data_subject/csv_file.rb +0 -162
  85. data/lib/remi/data_subject/data_frame.rb +0 -52
  86. data/lib/remi/data_subject/postgres.rb +0 -134
  87. data/lib/remi/data_subject/salesforce.rb +0 -136
  88. data/lib/remi/data_subject/sftp_file.rb +0 -65
  89. data/lib/remi/extractor/file_system.rb +0 -92
  90. data/lib/remi/extractor/local_file.rb +0 -43
  91. data/lib/remi/extractor/s3_file.rb +0 -57
  92. data/lib/remi/extractor/sftp_file.rb +0 -83
  93. data/spec/data_subject/csv_file_spec.rb +0 -79
  94. data/spec/data_subject/data_frame.rb +0 -27
@@ -0,0 +1,69 @@
1
+ require_relative '../remi_spec'
2
+
3
+ describe Parser::CsvFile do
4
+
5
+ let(:basic_file) { 'spec/fixtures/basic.csv' }
6
+ let(:basic_dataframe) do
7
+ Remi::DataFrame::Daru.new(
8
+ {
9
+ column_a: ['value 1A', 'value 2A'],
10
+ column_b: ['value 1B', 'value 2B']
11
+ }
12
+ )
13
+ end
14
+
15
+ it 'converts a CSV into a dataframe' do
16
+ csv = Parser::CsvFile.new
17
+ expect(csv.parse(basic_file).to_a).to eq basic_dataframe.to_a
18
+ end
19
+
20
+ it 'adds filename when requested' do
21
+ csv = Parser::CsvFile.new(
22
+ filename_field: :from_file
23
+ )
24
+
25
+ expected_files = [Pathname.new(basic_file).to_s] * 2
26
+ expect(csv.parse(basic_file)[:from_file].to_a).to eq expected_files
27
+ end
28
+
29
+ it 'preprocesses records when required' do
30
+ csv = Parser::CsvFile.new(
31
+ preprocessor: ->(line) { line.gsub(/\\"/,'""') }
32
+ )
33
+
34
+ bad_escape_file = 'spec/fixtures/unsupported_escape.csv'
35
+
36
+ expected_df = Remi::DataFrame::Daru.new(
37
+ {
38
+ column_a: ['value 1A', 'value 2A'],
39
+ column_b: ['value "1B"', 'value "2B"']
40
+ }
41
+ )
42
+ expect(csv.parse(bad_escape_file).to_a).to eq expected_df.to_a
43
+ end
44
+
45
+ it 'accepts standard Ruby CSV options' do
46
+ csv = Parser::CsvFile.new(
47
+ preprocessor: ->(line) { line.gsub(/,/,'|') },
48
+ csv_options: { col_sep: '|' }
49
+ )
50
+
51
+ expect(csv.parse(basic_file).to_a).to eq basic_dataframe.to_a
52
+ end
53
+
54
+ it 'combines multiple csv files into a single dataframe' do
55
+ csv = Parser::CsvFile.new
56
+ two_files = ['spec/fixtures/basic.csv', 'spec/fixtures/basic2.csv']
57
+
58
+ expected_df = Remi::DataFrame::Daru.new(
59
+ {
60
+ column_a: ['value 1A', 'value 2A', 'value 1A', 'value 2A'],
61
+ column_b: ['value 1B', 'value 2B', nil, nil],
62
+ column_c: [nil, nil, 'value 1C', 'value 2C']
63
+ }
64
+ )
65
+
66
+ expect(csv.parse(two_files).to_a).to eq expected_df.to_a
67
+ end
68
+
69
+ end
@@ -0,0 +1,52 @@
1
+ require_relative '../remi_spec'
2
+
3
+ describe Extractor::DataFrame do
4
+ let(:extractor) { Extractor::DataFrame.new(data: data) }
5
+ let(:data) { 'some_data' }
6
+
7
+ context '#data' do
8
+ it 'returns the raw data' do
9
+ expect(extractor.extract.data).to eq data
10
+ end
11
+ end
12
+ end
13
+
14
+ describe Parser::DataFrame do
15
+ let(:fields) do
16
+ {
17
+ brewer: { type: 'text' },
18
+ style: { type: 'text' }
19
+ }
20
+ end
21
+ let(:parser) { Parser::DataFrame.new(fields: fields) }
22
+ let(:df_extract) { double('df_extract') }
23
+ let(:data) {
24
+ [
25
+ [ 'Baerlic', 'IPA' ],
26
+ [ 'Ex Novo', 'Red' ]
27
+ ]
28
+ }
29
+
30
+ before do
31
+ allow(df_extract).to receive(:data) { data }
32
+ end
33
+
34
+ it 'converts the data array into a dataframe' do
35
+ expect(parser.parse df_extract).to be_a Remi::DataFrame::Daru
36
+ end
37
+
38
+ it 'converts the data array into the expected dataframe' do
39
+ expected_df = Daru::DataFrame.new(
40
+ :brewer => ['Baerlic', 'Ex Novo'],
41
+ :style => ['IPA', 'Red']
42
+ )
43
+ expect(parser.parse(df_extract).to_a).to eq expected_df.to_a
44
+ end
45
+
46
+ end
47
+
48
+ describe Encoder::DataFrame, skip: 'todo' do
49
+ end
50
+
51
+ describe Loader::DataFrame, skip: 'todo' do
52
+ end
@@ -0,0 +1,41 @@
1
+ require_relative '../remi_spec'
2
+
3
+ describe Extractor::None do
4
+ let(:extractor) { Extractor::None.new }
5
+
6
+ context '#extract' do
7
+ it 'does nothing' do
8
+ expect(extractor.extract).to be nil
9
+ end
10
+ end
11
+ end
12
+
13
+ describe Parser::None do
14
+ let(:parser) { Parser::None.new }
15
+
16
+ context '#parse' do
17
+ it 'returns what it is given' do
18
+ expect(parser.parse('some data')).to eq 'some data'
19
+ end
20
+ end
21
+ end
22
+
23
+ describe Encoder::None do
24
+ let(:encoder) { Encoder::None.new }
25
+
26
+ context '#encode' do
27
+ it 'returns what it is given' do
28
+ expect(encoder.encode('some data')).to eq 'some data'
29
+ end
30
+ end
31
+ end
32
+
33
+ describe Loader::None do
34
+ let(:loader) { Loader::None.new }
35
+
36
+ context '#load' do
37
+ it 'does nothing' do
38
+ expect(loader.load('some data')).to be true
39
+ end
40
+ end
41
+ end
@@ -0,0 +1,80 @@
1
+ require_relative '../remi_spec'
2
+
3
+ describe Extractor::Postgres do
4
+ let(:extractor) { Extractor::Postgres.new(credentials: {}, query: 'some_query') }
5
+ let(:pg_conn) { double('pg_conn') }
6
+ let(:data) { 'some postgres data' }
7
+
8
+ before do
9
+ allow(pg_conn).to receive(:exec) { data }
10
+ allow(extractor).to receive(:connection) { pg_conn }
11
+ end
12
+
13
+ context '#data' do
14
+ it 'returns extracted data' do
15
+ expect(extractor.extract.data).to eq data
16
+ end
17
+ end
18
+ end
19
+
20
+
21
+ describe Parser::Postgres do
22
+ let(:parser) { Parser::Postgres.new }
23
+ let(:pg_extract) { double('pg_extract') }
24
+ let(:data) do
25
+ [
26
+ { 'brewer' => 'Baerlic', 'style' => 'IPA', 'quantity' => 5 },
27
+ { 'brewer' => 'Ex Novo', 'style' => 'Red', 'quantity' => 3 }
28
+ ]
29
+ end
30
+
31
+ before do
32
+ allow(pg_extract).to receive(:data) { data }
33
+ end
34
+
35
+ it 'converts postgres response data into a dataframe' do
36
+ expect(parser.parse pg_extract).to be_a Remi::DataFrame::Daru
37
+ end
38
+
39
+ it 'converts data into the correct dataframe' do
40
+ expected_df = Daru::DataFrame.new(
41
+ :brewer => ['Baerlic', 'Ex Novo'],
42
+ :style => ['IPA', 'Red'],
43
+ :quantity => [5, 3]
44
+ )
45
+ expect(parser.parse(pg_extract).to_a).to eq expected_df.to_a
46
+ end
47
+ end
48
+
49
+
50
+ describe Encoder::Postgres do
51
+ let(:fields) do
52
+ {
53
+ brewer: { type: 'text' },
54
+ style: { type: 'text' },
55
+ quantity: { type: 'integer' }
56
+ }
57
+ end
58
+ let(:encoder) { Encoder::Postgres.new(fields: fields) }
59
+ let(:dataframe) do
60
+ expected_df = Daru::DataFrame.new(
61
+ :brewer => ['Baerlic', 'Ex Novo'],
62
+ :style => ['IPA', 'Red'],
63
+ :quantity => [5, 3]
64
+ )
65
+ end
66
+
67
+ it 'converts the dataframe into an array of strings to be used by the loader' do
68
+ expect(encoder.encode(dataframe).values).to eq [
69
+ "Baerlic\tIPA\t5",
70
+ "Ex Novo\tRed\t3"
71
+ ]
72
+ end
73
+
74
+ it 'builds the field ddl' do
75
+ expect(encoder.encode(dataframe).ddl_fields).to eq 'brewer text, style text, quantity integer'
76
+ end
77
+ end
78
+
79
+ describe Loader::Postgres, skip: 'todo' do
80
+ end
@@ -0,0 +1,117 @@
1
+ require_relative '../remi_spec'
2
+ require 'remi/data_subjects/salesforce.rb'
3
+
4
+ describe Extractor::Salesforce do
5
+ let(:extractor) { Extractor::Salesforce.new(object: :Contact, credentials: {}, query: '') }
6
+ let(:sf_bulk) { double('sf_bulk') }
7
+ let(:data) do
8
+ {
9
+ 'batches' => [
10
+ {
11
+ 'id' => ['751160000065e2BAAQ'],
12
+ 'state' => [ 'Completed' ]
13
+ }
14
+ ]
15
+ }
16
+ end
17
+
18
+ before do
19
+ allow(extractor).to receive(:sf_bulk) { sf_bulk }
20
+ allow(sf_bulk).to receive(:query) { data }
21
+ end
22
+
23
+ context '#data' do
24
+ it 'returns extracted data' do
25
+ expect(extractor.extract.data).to eq data
26
+ end
27
+ end
28
+
29
+
30
+ it 'raises an error if the batch fails' do
31
+ data['batches'].first['state'] = ['Error']
32
+ expect { extractor.extract }.to raise_error Extractor::Salesforce::ExtractError
33
+ end
34
+ end
35
+
36
+
37
+ describe Parser::Salesforce do
38
+ let(:parser) { Parser::Salesforce.new }
39
+ let(:sf_extract) { double('sf_extract') }
40
+ let(:data) do
41
+ {
42
+ 'batches' => [
43
+ {
44
+ 'id' => ['751160000065e2BAAQ'],
45
+ 'state' => [ 'Completed' ],
46
+ 'response' => [
47
+ {
48
+ "xsi:type" => "sObject",
49
+ "type" => [
50
+ "Contact"
51
+ ],
52
+ "Id" => [
53
+ "003G000001cKYaUIA4",
54
+ "003G000001cKYaUIA4"
55
+ ],
56
+ "Student_ID__c" => [
57
+ "FJD385628"
58
+ ]
59
+ },
60
+ {
61
+ "xsi:type" => "sObject",
62
+ "type" => [
63
+ "Contact"
64
+ ],
65
+ "Id" => [
66
+ "003G000001cKYbXIA4",
67
+ "003G000001cKYbXIA4"
68
+ ],
69
+ "Student_ID__c" => [
70
+ { 'xsi:nil' => 'true' }
71
+ ]
72
+ }
73
+ ]
74
+ }
75
+ ]
76
+ }
77
+ end
78
+
79
+ before do
80
+ allow(sf_extract).to receive(:data) { data }
81
+ end
82
+
83
+ it 'converts SalesforceBulkApi response data into a dataframe' do
84
+ expect(parser.parse sf_extract).to be_a Remi::DataFrame::Daru
85
+ end
86
+
87
+ it 'converts data into the correct dataframe' do
88
+ expected_df = Daru::DataFrame.new(
89
+ :Id => ['003G000001cKYaUIA4', '003G000001cKYbXIA4'],
90
+ :Student_ID__c => ['FJD385628', nil]
91
+ )
92
+ expect(parser.parse(sf_extract).to_a).to eq expected_df.to_a
93
+ end
94
+ end
95
+
96
+
97
+ describe Encoder::Salesforce do
98
+ let(:encoder) { Encoder::Salesforce.new }
99
+ let(:dataframe) do
100
+ Daru::DataFrame.new(
101
+ :Id => ['003G000001cKYaUIA4', '003G000001cKYbXIA4'],
102
+ :Student_ID__c => ['FJD385628', nil]
103
+ )
104
+ end
105
+
106
+ it 'converts the dataframe into an array of hashes' do
107
+ expected_result = [
108
+ { :Id => '003G000001cKYaUIA4', :Student_ID__c => 'FJD385628' },
109
+ { :Id => '003G000001cKYbXIA4', :Student_ID__c => nil },
110
+ ]
111
+ expect(encoder.encode dataframe).to eq expected_result
112
+ end
113
+ end
114
+
115
+
116
+ describe Loader::Salesforce, skip: 'todo' do
117
+ end
@@ -82,3 +82,19 @@ describe Extractor::SftpFile do
82
82
  end
83
83
  end
84
84
  end
85
+
86
+
87
+ describe Loader::SftpFile do
88
+ let(:loader) { Loader::SftpFile.new(credentials: {}, remote_path: 'some_path') }
89
+ let(:data) { double('some_data') }
90
+ let(:sftp_session) { instance_double('Net::SFTP::Session') }
91
+
92
+ before do
93
+ allow(Net::SFTP).to receive(:start).and_yield sftp_session
94
+ end
95
+
96
+ it 'loads a csv to a target sftp filesystem' do
97
+ expect(sftp_session).to receive(:upload!).with(data, 'some_path')
98
+ loader.load data
99
+ end
100
+ end
@@ -0,0 +1,33 @@
1
+ require_relative '../remi_spec'
2
+
3
+ describe 'sub jobs' do
4
+ before :each do
5
+ Object.send(:remove_const, :MySubJob) if Object.constants.include?(:MySubJob)
6
+ class MySubJob < Job
7
+ source(:sub_source) {}
8
+ target(:sub_target) {}
9
+ end
10
+ end
11
+
12
+ let(:sub_job) { Job::SubJob.new { MySubJob.new } }
13
+
14
+
15
+ describe Extractor::SubJob do
16
+ let(:extractor) { Extractor::SubJob.new(sub_job: sub_job, data_subject: :sub_target) }
17
+
18
+ it 'returns the data from the sub-job' do
19
+ allow(sub_job.job.sub_target).to receive(:df) { 'sub target df' }
20
+ expect(extractor.extract).to eq 'sub target df'
21
+ end
22
+ end
23
+
24
+ describe Loader::SubJob do
25
+ let(:loader) { Loader::SubJob.new(sub_job: sub_job, data_subject: :sub_source) }
26
+
27
+ it 'populates the sub-job data frame' do
28
+ some_data_frame = Daru::DataFrame.new({ a: [1,2,3] })
29
+ loader.load(some_data_frame)
30
+ expect(sub_job.job.sub_source.df).to eq some_data_frame
31
+ end
32
+ end
33
+ end
@@ -0,0 +1,38 @@
1
+ require_relative 'remi_spec'
2
+
3
+ describe Remi::Encoder do
4
+ let(:field_symbolizer) { double('field_symbolizer') }
5
+ let(:context) { double('context') }
6
+ let(:fields) { double('fields') }
7
+ let(:encoder) { Encoder.new(context: context, fields: fields, field_symbolizer: field_symbolizer) }
8
+
9
+ context '#encode' do
10
+ it 'has an encode method' do
11
+ expect(encoder).to respond_to :encode
12
+ end
13
+ end
14
+
15
+ context '#field_symbolizer' do
16
+ it 'can be set in the constructor' do
17
+ expect(encoder.field_symbolizer).to eq field_symbolizer
18
+ end
19
+
20
+ it 'the field_symbolizer defined in the context takes priority' do
21
+ symbolizer_from_context = double('symbolizer_from_context')
22
+ allow(context).to receive(:field_symbolizer) { symbolizer_from_context }
23
+ expect(encoder.field_symbolizer).to eq symbolizer_from_context
24
+ end
25
+ end
26
+
27
+ context '#fields' do
28
+ it 'can be set in the constructor' do
29
+ expect(encoder.fields).to eq fields
30
+ end
31
+
32
+ it 'the fields defined in the context take priority' do
33
+ fields_from_context = double('fields_from_context')
34
+ allow(context).to receive(:fields) { fields_from_context }
35
+ expect(encoder.fields).to eq fields_from_context
36
+ end
37
+ end
38
+ end