remi 0.2.42 → 0.3.0

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (94)
  1. checksums.yaml +4 -4
  2. data/.yardopts +7 -0
  3. data/Gemfile +1 -1
  4. data/Gemfile.lock +13 -26
  5. data/README.md +1 -1
  6. data/features/step_definitions/remi_step.rb +33 -13
  7. data/features/sub_job_example.feature +24 -0
  8. data/features/sub_transform_example.feature +35 -0
  9. data/features/sub_transform_many_to_many.feature +49 -0
  10. data/features/support/env_app.rb +1 -1
  11. data/jobs/all_jobs_shared.rb +19 -16
  12. data/jobs/copy_source_job.rb +11 -9
  13. data/jobs/csv_file_target_job.rb +10 -9
  14. data/jobs/json_job.rb +18 -14
  15. data/jobs/metadata_job.rb +33 -28
  16. data/jobs/parameters_job.rb +14 -11
  17. data/jobs/sample_job.rb +106 -77
  18. data/jobs/sftp_file_target_job.rb +14 -13
  19. data/jobs/sub_job_example_job.rb +86 -0
  20. data/jobs/sub_transform_example_job.rb +43 -0
  21. data/jobs/sub_transform_many_to_many_job.rb +46 -0
  22. data/jobs/transforms/concatenate_job.rb +16 -12
  23. data/jobs/transforms/data_frame_sieve_job.rb +24 -19
  24. data/jobs/transforms/date_diff_job.rb +15 -11
  25. data/jobs/transforms/nvl_job.rb +16 -12
  26. data/jobs/transforms/parse_date_job.rb +17 -14
  27. data/jobs/transforms/partitioner_job.rb +27 -19
  28. data/jobs/transforms/prefix_job.rb +13 -10
  29. data/jobs/transforms/truncate_job.rb +14 -10
  30. data/jobs/transforms/truthy_job.rb +11 -8
  31. data/lib/remi.rb +25 -11
  32. data/lib/remi/data_frame.rb +4 -4
  33. data/lib/remi/data_frame/daru.rb +1 -37
  34. data/lib/remi/data_subject.rb +234 -48
  35. data/lib/remi/data_subjects/csv_file.rb +171 -0
  36. data/lib/remi/data_subjects/data_frame.rb +106 -0
  37. data/lib/remi/data_subjects/file_system.rb +115 -0
  38. data/lib/remi/data_subjects/local_file.rb +109 -0
  39. data/lib/remi/data_subjects/none.rb +31 -0
  40. data/lib/remi/data_subjects/postgres.rb +186 -0
  41. data/lib/remi/data_subjects/s3_file.rb +84 -0
  42. data/lib/remi/data_subjects/salesforce.rb +211 -0
  43. data/lib/remi/data_subjects/sftp_file.rb +196 -0
  44. data/lib/remi/data_subjects/sub_job.rb +50 -0
  45. data/lib/remi/dsl.rb +74 -0
  46. data/lib/remi/encoder.rb +45 -0
  47. data/lib/remi/extractor.rb +21 -0
  48. data/lib/remi/field_symbolizers.rb +1 -0
  49. data/lib/remi/job.rb +279 -113
  50. data/lib/remi/job/parameters.rb +90 -0
  51. data/lib/remi/job/sub_job.rb +35 -0
  52. data/lib/remi/job/transform.rb +165 -0
  53. data/lib/remi/loader.rb +22 -0
  54. data/lib/remi/monkeys/daru.rb +4 -0
  55. data/lib/remi/parser.rb +44 -0
  56. data/lib/remi/testing/business_rules.rb +17 -23
  57. data/lib/remi/testing/data_stub.rb +2 -2
  58. data/lib/remi/version.rb +1 -1
  59. data/remi.gemspec +3 -0
  60. data/spec/data_subject_spec.rb +475 -11
  61. data/spec/data_subjects/csv_file_spec.rb +69 -0
  62. data/spec/data_subjects/data_frame_spec.rb +52 -0
  63. data/spec/{extractor → data_subjects}/file_system_spec.rb +0 -0
  64. data/spec/{extractor → data_subjects}/local_file_spec.rb +0 -0
  65. data/spec/data_subjects/none_spec.rb +41 -0
  66. data/spec/data_subjects/postgres_spec.rb +80 -0
  67. data/spec/{extractor → data_subjects}/s3_file_spec.rb +0 -0
  68. data/spec/data_subjects/salesforce_spec.rb +117 -0
  69. data/spec/{extractor → data_subjects}/sftp_file_spec.rb +16 -0
  70. data/spec/data_subjects/sub_job_spec.rb +33 -0
  71. data/spec/encoder_spec.rb +38 -0
  72. data/spec/extractor_spec.rb +11 -0
  73. data/spec/fixtures/sf_bulk_helper_stubs.rb +443 -0
  74. data/spec/job/transform_spec.rb +257 -0
  75. data/spec/job_spec.rb +507 -0
  76. data/spec/loader_spec.rb +11 -0
  77. data/spec/parser_spec.rb +38 -0
  78. data/spec/sf_bulk_helper_spec.rb +117 -0
  79. data/spec/testing/data_stub_spec.rb +5 -3
  80. metadata +109 -27
  81. data/features/aggregate.feature +0 -42
  82. data/jobs/aggregate_job.rb +0 -31
  83. data/jobs/transforms/transform_jobs.rb +0 -4
  84. data/lib/remi/data_subject/csv_file.rb +0 -162
  85. data/lib/remi/data_subject/data_frame.rb +0 -52
  86. data/lib/remi/data_subject/postgres.rb +0 -134
  87. data/lib/remi/data_subject/salesforce.rb +0 -136
  88. data/lib/remi/data_subject/sftp_file.rb +0 -65
  89. data/lib/remi/extractor/file_system.rb +0 -92
  90. data/lib/remi/extractor/local_file.rb +0 -43
  91. data/lib/remi/extractor/s3_file.rb +0 -57
  92. data/lib/remi/extractor/sftp_file.rb +0 -83
  93. data/spec/data_subject/csv_file_spec.rb +0 -79
  94. data/spec/data_subject/data_frame.rb +0 -27
--- a/data/jobs/aggregate_job.rb
+++ /dev/null
@@ -1,31 +0,0 @@
-require_relative 'all_jobs_shared'
-
-class AggregateJob
-  include AllJobsShared
-
-  define_source :source_data, Remi::DataSource::DataFrame
-  define_target :target_data, Remi::DataTarget::DataFrame
-  define_target :multigroup_target_data, Remi::DataTarget::DataFrame
-
-  define_transform :main, sources: :source_data, targets: :target_data do
-    mymin = lambda do |field, df, group_key, indicies|
-      values = indicies.map { |idx| df.row[idx][field] }
-      "Group #{group_key} has a minimum value of #{values.min}"
-    end
-
-    # Daru groups don't use the index of the dataframe when returning groups (WTF?).
-    # Instead they return the position of the record in the dataframe. Here, we
-    # shift the indexes which causes a failure if this artifact is not handled
-    # properly in the aggregate function
-    source_data.df.index = Daru::Index.new(1.upto(source_data.df.size).to_a)
-
-    target_data.df = source_data.df.aggregate(by: :alpha, func: mymin.curry.(:year)).detach_index
-    target_data.df.vectors = Daru::Index.new([:alpha, :year])
-
-    multigroup_target_data.df = source_data.df.aggregate(by: [:alpha,:beta], func: mymin.curry.(:year)).detach_index
-    multigroup_target_data.df.vectors = Daru::Index.new([:alpha_beta, :year])
-
-
-
-  end
-end
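
The Daru quirk called out in the removed transform can be seen in isolation. A minimal sketch (data values invented for illustration; assumes Daru's `GroupBy#groups`):

```ruby
require 'daru'

df = Daru::DataFrame.new(alpha: ['a', 'a', 'b'], year: [2015, 2012, 2018])
df.index = Daru::Index.new([1, 2, 3]) # shift away from the default 0-based index, as the job does

# Group members come back as record *positions* (0, 1, 2...), not index labels (1, 2, 3),
# so an aggregate function must treat them as positions when reading rows.
df.group_by(:alpha).groups # => {["a"]=>[0, 1], ["b"]=>[2]}
```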
--- a/data/jobs/transforms/transform_jobs.rb
+++ /dev/null
@@ -1,4 +0,0 @@
-require_relative 'prefix_job'
-require_relative 'parse_date_job'
-require_relative 'date_diff_job'
-require_relative 'nvl_job'
--- a/data/lib/remi/data_subject/csv_file.rb
+++ /dev/null
@@ -1,162 +0,0 @@
-module Remi
-  module DataSubject::CsvFile
-    def self.included(base)
-      base.extend(CsvFileClassMethods)
-    end
-
-    def field_symbolizer
-      self.class.default_csv_options[:header_converters]
-    end
-
-    module CsvFileClassMethods
-      def default_csv_options
-        @default_csv_options ||= CSV::DEFAULT_OPTIONS.merge({
-          headers: true,
-          header_converters: Remi::FieldSymbolizers[:standard],
-          converters: [],
-          col_sep: ',',
-          encoding: 'UTF-8',
-          quote_char: '"'
-        })
-      end
-    end
-  end
-
-
-
-
-
-  class DataSource::CsvFile < DataSource
-    include Remi::DataSubject::CsvFile
-
-    def initialize(*args, **kargs, &block)
-      super
-      init_csv_file(*args, **kargs, &block)
-    end
-
-    attr_reader :extractor
-    attr_reader :csv_options
-
-    # Public: Called to extract data from the source.
-    #
-    # Returns data in a format that can be used to create a dataframe.
-    def extract!
-      @extract = Array(@extractor.extract)
-    end
-
-    # Public: Converts extracted data to a dataframe.
-    # Currently only supports Daru DataFrames.
-    #
-    # Returns a Remi::DataFrame
-    def to_dataframe
-      # Assumes that each file has exactly the same structure
-      result_df = nil
-      extract.each_with_index do |filename, idx|
-        filename = filename.to_s
-
-        @logger.info "Converting #{filename} to a dataframe"
-        processed_filename = preprocess(filename)
-        csv_df = Daru::DataFrame.from_csv processed_filename, @csv_options
-
-        csv_df[@filename_field] = Daru::Vector.new([filename] * csv_df.size, index: csv_df.index) if @filename_field
-        if idx == 0
-          result_df = csv_df
-        else
-          result_df = result_df.concat csv_df
-        end
-      end
-
-      Remi::DataFrame.create(:daru, result_df)
-    end
-
-
-
-    def extractor=(arg)
-      @extractor = arg.respond_to?(:extract) ? arg : Extractor::LocalFile.new(remote_path: arg.to_s)
-    end
-
-    # Only going to support single file for now
-    def source_filename
-      raise "Multiple source files detected" if extract.size > 1
-      @source_filename ||= extract.first
-    end
-
-    def first_line
-      # Readline assumes \n line endings. Strip out \r if it is a DOS file.
-      @first_line ||= File.open(source_filename) do |f|
-        f.readline.gsub(/\r/,'')
-      end
-    end
-
-    def headers
-      @headers ||= CSV.open(source_filename, 'r', source_csv_options) { |csv| csv.first }.headers
-    end
-
-    def valid_headers?
-      (fields.keys - headers).empty?
-    end
-
-
-
-    private
-
-    def preprocess(filename)
-      return filename unless @preprocessor
-      @logger.info "Preprocessing #{filename}"
-      tmp_filename = File.join(Remi::Settings.work_dir, "#{Pathname.new(filename).basename}-#{SecureRandom.uuid}")
-
-      dirname = Pathname.new(tmp_filename).dirname
-      FileUtils.mkdir_p(dirname) unless File.directory? dirname
-
-      File.open(tmp_filename, 'w') do |outfile|
-        File.foreach(filename) do |in_line|
-          outfile.write @preprocessor.call(in_line)
-        end
-      end
-
-      tmp_filename
-    end
-
-    def init_csv_file(*args, extractor:, csv_options: {}, filename_field: nil, preprocessor: nil, **kargs, &block)
-      self.extractor = extractor
-      @csv_options = self.class.default_csv_options.merge(csv_options)
-      @filename_field = filename_field
-      @preprocessor = preprocessor
-    end
-  end
-
-
-
-
-
-  class DataTarget::CsvFile < DataTarget
-    include ::Remi::DataSubject::CsvFile
-
-    default_csv_options[:row_sep] = "\n"
-
-    def initialize(*args, **kargs, &block)
-      super
-      init_csv_file(*args, **kargs, &block)
-    end
-
-    attr_reader :csv_options
-
-    # Public: Performs the load operation, regardless of whether it has
-    # already executed.
-    #
-    # Returns true if the load operation was successful
-    def load!
-      @logger.info "Writing CSV file #{@path}"
-      df.write_csv @path, @csv_options
-      true
-    end
-
-
-    private
-
-    def init_csv_file(*args, path:, csv_options: {}, **kargs, &block)
-      @path = path
-      @csv_options = self.class.default_csv_options.merge(csv_options)
-    end
-  end
-end
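
For contrast with the new `data/lib/remi/data_subjects/csv_file.rb`, this is roughly how the removed 0.2.x class was driven. A hypothetical sketch based only on the keywords of `init_csv_file` above (the path, options, and field list are invented):

```ruby
source = Remi::DataSource::CsvFile.new(
  fields: { id: {}, name: {} },                 # assumed: field metadata handled by the DataSubject base
  extractor: 'input/contacts.csv',              # a plain string is wrapped in Extractor::LocalFile
  csv_options: { col_sep: '|' },                # merged over default_csv_options
  filename_field: :from_file,                   # optional: tag each row with the file it came from
  preprocessor: ->(line) { line.tr(';', ',') }  # optional: rewrite each line before CSV parsing
)

source.extract!          # resolve the file(s) to read
df = source.to_dataframe # parse and concatenate them into a Remi::DataFrame
```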
--- a/data/lib/remi/data_subject/data_frame.rb
+++ /dev/null
@@ -1,52 +0,0 @@
-module Remi
-
-  class DataSource::DataFrame < DataSource
-
-    def initialize(*args, **kargs, &block)
-      super
-      init_df(*args, **kargs, &block)
-    end
-
-    # Public: Called to extract data from the source.
-    #
-    # Returns data in a format that can be used to create a dataframe.
-    def extract!
-      @extract = @data.transpose
-    end
-
-    # Public: Converts extracted data to a dataframe
-    #
-    # Returns a Remi::DataFrame
-    def to_dataframe
-      DataFrame.create(@remi_df_type, extract, order: @fields.keys)
-    end
-
-    private
-
-    def init_df(*args, data: [], **kargs, &block)
-      @data = data
-    end
-  end
-
-
-  class DataTarget::DataFrame < DataTarget
-
-    def initialize(*args, **kargs, &block)
-      super
-      init_df(*args, **kargs, &block)
-    end
-
-    # Public: Performs the load operation, regardless of whether it has
-    # already executed.
-    #
-    # Returns true if the load operation was successful
-    def load!
-      true
-    end
-
-    private
-
-    def init_df(*args, **kargs, &block)
-    end
-  end
-end
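
The removed DataFrame source took row-wise arrays and transposed them into column vectors. A minimal sketch, under the same assumption that field metadata (`@fields`) is supplied through the base class:

```ruby
source = Remi::DataSource::DataFrame.new(
  fields: { id: {}, name: {} },  # assumed: the keys give the column order used by to_dataframe
  data: [
    [1, 'Alfa'],                 # one array per row; extract! transposes rows into columns
    [2, 'Bravo']
  ]
)

source.extract!
df = source.to_dataframe
```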
--- a/data/lib/remi/data_subject/postgres.rb
+++ /dev/null
@@ -1,134 +0,0 @@
-module Remi
-  module DataSubject::Postgres
-    def connection
-      @connection ||= PG.connect(
-        host: @credentials[:host] || 'localhost',
-        port: @credentials[:port] || 5432,
-        dbname: @credentials[:dbname],
-        user: @credentials[:user] || `whoami`.chomp,
-        password: @credentials[:password],
-        sslmode: @credentials[:sslmode] || 'allow'
-      )
-    end
-  end
-
-
-  class DataSource::Postgres < DataSource
-    include Remi::DataSubject::Postgres
-
-
-    def initialize(*args, **kargs, &block)
-      super
-      init_postgres(*args, **kargs, &block)
-    end
-
-    # Public: Called to extract data from the source.
-    #
-    # Returns data in a format that can be used to create a dataframe.
-    def extract!
-      @logger.info "Executing query #{@query}"
-      @extract = connection.exec @query
-    end
-
-    # Public: Converts extracted data to a dataframe.
-    # Currently only supports Daru DataFrames.
-    #
-    # Returns a Remi::DataFrame
-    def to_dataframe
-      # Performance for larger sets could be improved by using bulk query (via COPY)
-      @logger.info "Converting query to a dataframe"
-
-      hash_array = {}
-      extract.each do |row|
-        row.each do |field, value|
-          (hash_array[field_symbolizer.call(field)] ||= []) << value
-        end
-      end
-
-      # After converting to DF, clear the PG results to save memory.
-      extract.clear
-
-      Remi::DataFrame.create(@remi_df_type, hash_array, order: hash_array.keys)
-    end
-
-
-    private
-
-    def init_postgres(*args, credentials:, query:, **kargs, &block)
-      @credentials = credentials
-      @query = query
-    end
-  end
-
-
-
-  # VERY PRELIMINARY IMPLEMENTAtION - ONLY LOADS TO TEMP TABLES
-  # IT IS THEN UP TO THE USER TO DO ELT TO LOAD THE FINAL TABLE
-  class DataTarget::Postgres < DataTarget
-    include Remi::DataSubject::Postgres
-
-    def initialize(*args, **kargs, &block)
-      super
-      init_postgres(*args, **kargs, &block)
-    end
-
-    # Public: Performs the load operation, regardless of whether it has
-    # already executed.
-    #
-    # Returns true if the load operation was successful
-    def load!
-      @logger.info "Performing postgres load to table #{@table_name}"
-      create_target_table
-      load_target_table
-
-      true
-    end
-
-
-    private
-
-    def init_postgres(*args, credentials:, table_name:, **kargs, &block)
-      @credentials = credentials
-      @table_name = table_name
-    end
-
-    def fields_with_type_ddl
-      @fields.map { |k,v| "#{k} #{v[:type]}" }.join(', ')
-    end
-
-    def create_target_table
-      create_table_sql = <<-EOT
-        CREATE TEMPORARY TABLE #{@table_name} (
-          #{fields_with_type_ddl}
-        )
-      EOT
-
-      @logger.info create_table_sql
-      connection.exec create_table_sql
-    end
-
-    def load_target_table
-      connection.copy_data "COPY #{@table_name} (#{@fields.keys.join(', ')}) FROM STDIN" do
-        df.each(:row) do |row|
-          row_str = @fields.keys.map do |field|
-            field = row[field]
-            case
-            when field.respond_to?(:strftime)
-              field.strftime('%Y-%m-%d %H:%M:%S')
-            when field.respond_to?(:map)
-              field.to_json.gsub("\t", '\t')
-            when field.blank? && !field.nil?
-              ''
-            when field.nil?
-              '\N'
-            else
-              field.to_s.gsub(/[\t\n\r]/, "\t" => '\t', "\n" => '\n', "\r" => '\r')
-            end
-          end.join("\t")
-
-          connection.put_copy_data row_str + "\n"
-        end
-      end
-    end
-  end
-end
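
As the warning comment says, the removed target only ever loaded a temporary table, leaving the final ELT step to the user. A hypothetical sketch of its use (credential values and field types invented):

```ruby
target = Remi::DataTarget::Postgres.new(
  credentials: { dbname: 'mydb' }, # host, port, user, and sslmode fall back to the defaults in #connection
  table_name: 'contacts_stage',
  fields: { id: { type: 'integer' }, name: { type: 'varchar(80)' } } # becomes the CREATE TABLE column DDL
)

target.df = transformed_df # a Daru-backed Remi::DataFrame built earlier in the job
target.load!               # CREATE TEMPORARY TABLE, then COPY ... FROM STDIN one row at a time
```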
--- a/data/lib/remi/data_subject/salesforce.rb
+++ /dev/null
@@ -1,136 +0,0 @@
-require 'restforce'
-require 'salesforce_bulk_api'
-require 'remi/sf_bulk_helper'
-
-module Remi
-  module DataSubject::Salesforce
-    def field_symbolizer
-      Remi::FieldSymbolizers[:salesforce]
-    end
-
-    def restforce_client
-      @restforce_client ||= begin
-        client = Restforce.new(@credentials)
-
-        #run a dummy query to initiate a connection. Workaround for Bulk API problem
-        # https://github.com/yatish27/salesforce_bulk_api/issues/33
-        client.query('SELECT Id FROM Contact LIMIT 1')
-        client
-      end
-    end
-  end
-
-
-  class DataSource::Salesforce < DataSource
-    include Remi::DataSubject::Salesforce
-
-    def initialize(*args, **kargs, &block)
-      super
-      init_salesforce(*args, **kargs, &block)
-    end
-
-    # Public: Called to extract data from the source.
-    #
-    # Returns data in a format that can be used to create a dataframe.
-    def extract!
-      @extract = sf_bulk.query(@sfo, @query, 10000)
-
-      check_for_errors(@extract)
-      @extract
-    end
-
-    def sf_bulk
-      @sf_bulk ||= SalesforceBulkApi::Api.new(restforce_client).tap { |o| o.connection.set_status_throttle(5) }
-    end
-
-    # Public: Converts extracted data to a dataframe.
-    # Currently only supports Daru DataFrames.
-    #
-    # Returns a Remi::DataFrame
-    def to_dataframe
-      @logger.info "Converting salesforce query results to a dataframe"
-
-      hash_array = {}
-      extract['batches'].each do |batch|
-        next unless batch['response']
-
-        batch['response'].each do |record|
-          record.each do |field, value|
-            next if ['xsi:type','type'].include? field
-            (hash_array[field.to_sym] ||= []) << case value.first
-                                                 when Hash
-                                                   value.first["xsi:nil"] == "true" ? nil : value.first
-                                                 else
-                                                   value.first
-                                                 end
-          end
-        end
-
-        # delete raw result at end of processing to free memory
-        batch['response'] = nil
-      end
-
-      Remi::DataFrame.create(@remi_df_type, hash_array, order: hash_array.keys)
-    end
-
-
-    private
-
-    def init_salesforce(*args, object:, query:, credentials:, api: :bulk, **kargs, &block)
-      @sfo = object
-      @query = query
-      @credentials = credentials
-      @api = api
-    end
-
-    def check_for_errors(sf_result)
-      sf_result['batches'].each do |batch|
-        raise "Error with batch #{batch['id']} - #{batch['state']}: #{batch['stateMessage']}" unless batch['state'].first == 'Completed'
-      end
-    end
-  end
-
-
-  class DataTarget::Salesforce < DataTarget
-    include Remi::DataSubject::Salesforce
-
-    def initialize(*args, **kargs, &block)
-      super
-      init_salesforce(*args, **kargs, &block)
-    end
-
-    # Public: Performs the load operation, regardless of whether it has
-    # already executed.
-    #
-    # Returns true if the load operation was successful
-    def load!
-      @logger.info "Performing Salesforce #{@operation} on object #{@sfo}"
-
-      df_as_array_of_hashes = df.to_a[0] # This probably wouldn't work with a non-Daru df
-      if @operation == :update
-        Remi::SfBulkHelper::SfBulkUpdate.update(restforce_client, @sfo, df_as_array_of_hashes, batch_size: @batch_size, logger: @logger)
-      elsif @operation == :create
-        Remi::SfBulkHelper::SfBulkCreate.create(restforce_client, @sfo, df_as_array_of_hashes, batch_size: @batch_size, logger: @logger)
-      elsif @operation == :upsert
-        Remi::SfBulkHelper::SfBulkUpsert.upsert(restforce_client, @sfo, df_as_array_of_hashes, batch_size: @batch_size, external_id: @external_id, logger: @logger)
-      else
-        raise ArgumentError, "Unknown operation: #{@operation}"
-      end
-
-      true
-    end
-
-    private
-
-    def init_salesforce(*args, object:, operation:, credentials:, batch_size: 5000, external_id: 'Id', api: :bulk, **kargs, &block)
-      @sfo = object
-      @operation = operation
-      @batch_size = batch_size
-      @external_id = external_id
-      @credentials = credentials
-      @api = api
-    end
-  end
-
-
-end
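
And a hypothetical sketch of the removed Salesforce source (the credential keys are illustrative Restforce options, not confirmed by this diff):

```ruby
source = Remi::DataSource::Salesforce.new(
  object: 'Contact',
  query: 'SELECT Id, Email FROM Contact',
  credentials: {
    username: 'user@example.com', # illustrative only
    password: 'secret',
    security_token: 'token'
  }
)

source.extract!          # bulk query in batches of 10,000; raises unless every batch is 'Completed'
df = source.to_dataframe # flattens the batch responses, mapping "xsi:nil" values to nil
```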