remi 0.2.27 → 0.2.28

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. checksums.yaml +4 -4
  2. data/Gemfile +3 -0
  3. data/Gemfile.lock +34 -5
  4. data/features/metadata.feature +17 -0
  5. data/features/step_definitions/remi_step.rb +6 -6
  6. data/features/transforms/date_diff.feature +1 -0
  7. data/jobs/aggregate_job.rb +0 -1
  8. data/jobs/all_jobs_shared.rb +0 -2
  9. data/jobs/copy_source_job.rb +0 -1
  10. data/jobs/csv_file_target_job.rb +0 -1
  11. data/jobs/metadata_job.rb +60 -0
  12. data/jobs/parameters_job.rb +1 -1
  13. data/jobs/sample_job.rb +19 -20
  14. data/jobs/sftp_file_target_job.rb +0 -1
  15. data/jobs/transforms/date_diff_job.rb +1 -1
  16. data/jobs/transforms/nvl_job.rb +1 -1
  17. data/jobs/transforms/parse_date_job.rb +7 -4
  18. data/jobs/transforms/prefix_job.rb +1 -1
  19. data/jobs/transforms/truncate_job.rb +1 -1
  20. data/lib/remi.rb +10 -15
  21. data/lib/remi/cucumber/business_rules.rb +23 -23
  22. data/lib/remi/cucumber/data_source.rb +2 -1
  23. data/lib/remi/data_frame.rb +36 -0
  24. data/lib/remi/data_frame/daru.rb +67 -0
  25. data/lib/remi/data_subject.rb +71 -10
  26. data/lib/remi/data_subject/csv_file.rb +151 -0
  27. data/lib/remi/data_subject/data_frame.rb +53 -0
  28. data/lib/remi/data_subject/postgres.rb +136 -0
  29. data/lib/remi/data_subject/salesforce.rb +136 -0
  30. data/lib/remi/data_subject/sftp_file.rb +66 -0
  31. data/lib/remi/fields.rb +8 -0
  32. data/lib/remi/source_to_target_map.rb +56 -32
  33. data/lib/remi/transform.rb +426 -83
  34. data/lib/remi/version.rb +1 -1
  35. data/remi.gemspec +2 -1
  36. data/spec/metadata_spec.rb +62 -0
  37. metadata +15 -28
  38. data/lib/remi/data_source.rb +0 -13
  39. data/lib/remi/data_source/csv_file.rb +0 -101
  40. data/lib/remi/data_source/data_frame.rb +0 -16
  41. data/lib/remi/data_source/postgres.rb +0 -58
  42. data/lib/remi/data_source/salesforce.rb +0 -87
  43. data/lib/remi/data_target.rb +0 -15
  44. data/lib/remi/data_target/csv_file.rb +0 -42
  45. data/lib/remi/data_target/data_frame.rb +0 -14
  46. data/lib/remi/data_target/postgres.rb +0 -74
  47. data/lib/remi/data_target/salesforce.rb +0 -54
  48. data/lib/remi/data_target/sftp_file.rb +0 -54
  49. data/lib/remi/refinements/daru.rb +0 -85

data/lib/remi/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Remi
-  VERSION = '0.2.27'
+  VERSION = '0.2.28'
 end

data/remi.gemspec CHANGED
@@ -13,7 +13,8 @@ Gem::Specification.new do |s|
   s.description = "Data manipulation and ETL in Ruby"

   s.rubyforge_project = "Remi"
-  s.add_runtime_dependency "daru", ["0.1.2"]
+  # Making use of a fork for now
+  # s.add_runtime_dependency "daru", ["0.1.2"]

   s.add_runtime_dependency 'bond', ['~> 0.5']
   s.add_runtime_dependency 'docile', ['~> 1.1']
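
The comment above suggests the forked daru is now pulled in through the bundle rather than the gemspec (data/Gemfile gains three lines in this release, per the file list). A minimal sketch of what such a Gemfile entry could look like; the fork URL and branch are placeholders, not taken from this diff:

# Hypothetical Gemfile entry for a daru fork; the URL and branch are placeholders.
gem 'daru', git: 'https://github.com/example/daru.git', branch: 'remi-fixes'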

data/spec/metadata_spec.rb ADDED
@@ -0,0 +1,62 @@
+=begin
+
+this is probably mostly about data subjects
+
+
+calling fields on the data subject should return only the fields defined, even if there
+are additional fields on the dataframe
+
+dataframe metadata is merged into field metadata, always with a preference for field metadata
+
+metadata propagates through 1:1 STTMs
+
+metadata propagates through intermediate dataframes that are not data subjects
+
+
+puts '---- ROUTE 1 - direct -----'
+out_activity.df = Remi::DataFrame.create(:daru, [], order: out_activity.fields.keys)#, index: in_activity.df.index)
+Remi::SourceToTargetMap.apply(in_activity.df, out_activity.df) do
+  # map source(:activity_id, :student_id) .target(:activity_id, :student_id)
+  map source(:activity_id) .target(:activity_id)
+    .transform(->(v) { "-#{v}-" })
+  # enforce types needs to be based on the "fields" for the target
+  # I might have to convert any Daru Dataframe to Remi dataframes in the STTM
+
+  # .transform(Remi::Transform[:enforce_types].(on_error: :ignore))
+
+  map source(:student_id) .target(:student_id)
+  map source(:student_dob) .target(:student_dob)
+end
+
+
+puts "out_activity.fields: #{out_activity.fields}"
+puts "out_activity.df metadata: #{out_activity.df.vector_metadata}"
+out_activity.df = out_activity.df[*(out_activity.fields.keys)]
+puts "out_activity.fields: #{out_activity.fields}"
+puts "out_activity.df metadata: #{out_activity.df.vector_metadata}"
+IRuby.display out_activity.df, type: 'text/html'
+
+
+puts '---- ROUTE 2 - via work_df -----'
+work_df = Remi::DataFrame.create(:daru, [], order: out_activity.fields.keys)#, index: in_activity.df.index)
+Remi::SourceToTargetMap.apply(in_activity.df, work_df) do
+  map source(:activity_id) .target(:activity_id)
+  # .transform(Remi::Transform[:enforce_types].(on_error: :ignore))
+
+  map source(:student_id) .target(:student_id)
+  map source(:student_dob) .target(:student_dob)
+end
+
+IRuby.display work_df, type: 'text/html'
+puts "work_df metadata: #{work_df.vector_metadata}"
+
+puts "out_activity.fields metadata: #{out_activity.fields}"
+puts "out_activity.df metadata: #{out_activity.df.vector_metadata}"
+puts "work_df is a #{work_df.class}"
+out_activity.df = work_df#[*out_activity.fields.keys]
+puts "out_activity.fields metadata: #{out_activity.fields}"
+puts "out_activity.df metadata: #{out_activity.df.vector_metadata}"
+IRuby.display out_activity.df, type: 'text/html'
+
+
+=end
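
The notes in this new spec pin down one rule worth illustrating: when dataframe metadata and field metadata are merged, the field metadata wins. A minimal sketch of that merge behavior using plain Ruby hashes; the key names are illustrative and not taken from the spec:

# Field metadata takes precedence over dataframe metadata on merge.
df_meta    = { type: :string, source: 'activity_export' }  # hypothetical keys
field_meta = { type: :date, in_format: '%m/%d/%Y' }
df_meta.merge(field_meta)
# => value for :type comes from field_meta; keys unique to either hash are kept
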
metadata CHANGED
@@ -1,29 +1,15 @@
 --- !ruby/object:Gem::Specification
 name: remi
 version: !ruby/object:Gem::Version
-  version: 0.2.27
+  version: 0.2.28
 platform: ruby
 authors:
 - Sterling Paramore
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-04-13 00:00:00.000000000 Z
+date: 2016-04-28 00:00:00.000000000 Z
 dependencies:
-- !ruby/object:Gem::Dependency
-  name: daru
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - '='
-      - !ruby/object:Gem::Version
-        version: 0.1.2
-  type: :runtime
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - '='
-      - !ruby/object:Gem::Version
-        version: 0.1.2
 - !ruby/object:Gem::Dependency
   name: bond
   requirement: !ruby/object:Gem::Requirement
@@ -200,6 +186,7 @@ files:
 - features/csv_file_target_job.feature
 - features/examples.feature
 - features/formulas.feature
+- features/metadata.feature
 - features/parameters.feature
 - features/sample_job.feature
 - features/sftp_file_target_job.feature
@@ -215,6 +202,7 @@ files:
 - jobs/all_jobs_shared.rb
 - jobs/copy_source_job.rb
 - jobs/csv_file_target_job.rb
+- jobs/metadata_job.rb
 - jobs/parameters_job.rb
 - jobs/sample_job.rb
 - jobs/sftp_file_target_job.rb
@@ -229,22 +217,18 @@ files:
 - lib/remi/cucumber.rb
 - lib/remi/cucumber/business_rules.rb
 - lib/remi/cucumber/data_source.rb
-- lib/remi/data_source.rb
-- lib/remi/data_source/csv_file.rb
-- lib/remi/data_source/data_frame.rb
-- lib/remi/data_source/postgres.rb
-- lib/remi/data_source/salesforce.rb
+- lib/remi/data_frame.rb
+- lib/remi/data_frame/daru.rb
 - lib/remi/data_subject.rb
-- lib/remi/data_target.rb
-- lib/remi/data_target/csv_file.rb
-- lib/remi/data_target/data_frame.rb
-- lib/remi/data_target/postgres.rb
-- lib/remi/data_target/salesforce.rb
-- lib/remi/data_target/sftp_file.rb
+- lib/remi/data_subject/csv_file.rb
+- lib/remi/data_subject/data_frame.rb
+- lib/remi/data_subject/postgres.rb
+- lib/remi/data_subject/salesforce.rb
+- lib/remi/data_subject/sftp_file.rb
 - lib/remi/extractor/sftp_file.rb
 - lib/remi/field_symbolizers.rb
+- lib/remi/fields.rb
 - lib/remi/job.rb
-- lib/remi/refinements/daru.rb
 - lib/remi/refinements/symbolizer.rb
 - lib/remi/settings.rb
 - lib/remi/sf_bulk_helper.rb
@@ -253,6 +237,7 @@ files:
 - lib/remi/version.rb
 - remi.gemspec
 - spec/extractor/sftp_file_spec.rb
+- spec/metadata_spec.rb
 - spec/remi_spec.rb
 - workbooks/sample_workbook.ipynb
 - workbooks/workbook_helper.rb
@@ -285,6 +270,7 @@ test_files:
 - features/csv_file_target_job.feature
 - features/examples.feature
 - features/formulas.feature
+- features/metadata.feature
 - features/parameters.feature
 - features/sample_job.feature
 - features/sftp_file_target_job.feature
@@ -297,4 +283,5 @@ test_files:
 - features/transforms/prefix.feature
 - features/transforms/truncate.feature
 - spec/extractor/sftp_file_spec.rb
+- spec/metadata_spec.rb
 - spec/remi_spec.rb

data/lib/remi/data_source.rb DELETED
@@ -1,13 +0,0 @@
-module Remi
-  module DataSource
-    include DataSubject
-
-    def extract
-      raise "Extract function undefined for #{self.class.name}"
-    end
-
-    def feild_symbolizer
-      Remi::FieldSymbolizers[:standard]
-    end
-  end
-end

data/lib/remi/data_source/csv_file.rb DELETED
@@ -1,101 +0,0 @@
-module Remi
-  module DataSource
-    class CsvFile
-      include DataSource
-
-      using Remi::Refinements::Daru
-
-      def self.default_csv_options
-        CSV::DEFAULT_OPTIONS.merge({
-          headers: true,
-          header_converters: Remi::FieldSymbolizers[:standard],
-          converters: [],
-          col_sep: ',',
-          encoding: 'UTF-8',
-          quote_char: '"'
-        })
-      end
-
-
-      def initialize(fields: {}, extractor:, csv_options: {}, filename_field: nil, logger: Remi::Settings.logger)
-        @fields = fields
-        self.extractor = extractor
-        @csv_options = self.class.default_csv_options.merge(csv_options)
-        @filename_field = filename_field
-        @logger = logger
-      end
-
-      attr_accessor :fields
-      attr_reader :extractor
-      attr_reader :csv_options
-
-      def field_symbolizer
-        self.class.default_csv_options[:header_converters]
-      end
-
-      def extract
-        @extracted = Array(@extractor.extract)
-      end
-
-      def extracted
-        @extracted || extract
-      end
-
-      def extractor=(arg)
-        case arg
-        when Extractor::SftpFile, Extractor::LocalFile
-          @extractor = arg
-        when String
-          @extractor = Extractor::LocalFile.new(path: arg)
-        when Regexp
-          raise "Adding regex matching to local files would be easy, not done yet"
-        else
-          raise "Unknown extractor of type #{arg.class}: #{arg}"
-        end
-      end
-
-      # Only going to support single file for now
-      def source_filename
-        raise "Multiple source files detected" if extracted.size > 1
-        @source_filename ||= extracted.first
-      end
-
-      def first_line
-        # Readline assumes \n line endings. Strip out \r if it is a DOS file.
-        @first_line ||= File.open(source_filename) do |f|
-          f.readline.gsub(/\r/,'')
-        end
-      end
-
-      def headers
-        @headers ||= CSV.open(source_filename, 'r', source_csv_options) { |csv| csv.first }.headers
-      end
-
-      def valid_headers?
-        (fields.keys - headers).empty?
-      end
-
-      def to_dataframe
-        # Assumes that each file has exactly the same structure
-        result_df = nil
-        extracted.each_with_index do |filename, idx|
-          @logger.info "Converting #{filename} to a dataframe"
-          csv_df = Daru::DataFrame.from_csv filename, @csv_options
-
-          csv_df[@filename_field] = Daru::Vector.new([filename] * csv_df.size, index: csv_df.index) if @filename_field
-          if idx == 0
-            result_df = csv_df
-          else
-            result_df = result_df.concat csv_df
-          end
-        end
-
-        result_df
-      end
-
-      def df
-        @dataframe ||= to_dataframe
-      end
-    end
-  end
-end
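
For reference while reading the refactor, here is a hypothetical usage sketch of the CSV source deleted above, built only from its initializer and #df; the file path, field names, and options are placeholders:

# Hypothetical pre-0.2.28 usage; the file path and field names are placeholders.
source = Remi::DataSource::CsvFile.new(
  fields: { id: {}, name: {} },      # field metadata; only the keys matter here
  extractor: 'input/contacts.csv',   # a String is wrapped in Extractor::LocalFile
  csv_options: { col_sep: '|' }      # merged over default_csv_options
)
source.df  # extracts the file(s) and memoizes the result as a Daru::DataFrame

Per the file list, the equivalent functionality in 0.2.28 appears to live in lib/remi/data_subject/csv_file.rb.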

data/lib/remi/data_source/data_frame.rb DELETED
@@ -1,16 +0,0 @@
-module Remi
-  module DataSource
-    class DataFrame
-      include DataSubject
-
-      def initialize(fields: {}, **args)
-        @fields = fields
-      end
-
-      def df
-        @dataframe ||= Daru::DataFrame.new([], order: @fields.keys)
-      end
-
-    end
-  end
-end

data/lib/remi/data_source/postgres.rb DELETED
@@ -1,58 +0,0 @@
-module Remi
-  module DataSource
-    class Postgres
-      include DataSource
-
-      def initialize(fields: {}, credentials:, query:, logger: Remi::Settings.logger)
-        @fields = fields
-        @credentials = credentials
-        @query = query
-        @logger = logger
-      end
-
-      attr_accessor :fields
-
-      def extract
-        @logger.info "Executing query #{@query}"
-        @raw_result = connection.exec @query
-      end
-
-      def raw_result
-        @raw_result ||= extract
-      end
-
-      def connection
-        @connection ||= PG.connect(
-          host: @credentials[:host] || 'localhost',
-          port: @credentials[:port] || 5432,
-          dbname: @credentials[:dbname],
-          user: @credentials[:user] || `whoami`.chomp,
-          password: @credentials[:password],
-          sslmode: @credentials[:sslmode] || 'allow'
-        )
-      end
-
-
-      def to_dataframe
-        # Performance for larger sets could be improved by using bulk query (via COPY)
-        @logger.info "Converting query to a dataframe"
-
-        hash_array = {}
-        raw_result.each do |row|
-          row.each do |field, value|
-            (hash_array[field_symbolizer.call(field)] ||= []) << value
-          end
-        end
-
-        # After converting to DF, clear the PG results to save memory.
-        raw_result.clear
-
-        Daru::DataFrame.new hash_array, order: hash_array.keys
-      end
-
-      def df
-        @dataframe ||= to_dataframe
-      end
-    end
-  end
-end
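
Likewise, a hypothetical usage sketch of the deleted Postgres source, using only the initializer and credential keys shown above; the connection details and query are placeholders:

# Hypothetical pre-0.2.28 usage; credentials and query are placeholders.
source = Remi::DataSource::Postgres.new(
  fields: { id: {}, email: {} },
  credentials: { dbname: 'example_db', user: 'etl', password: 'secret' },
  query: 'SELECT id, email FROM contacts'
)
source.df  # runs the query once, then builds a Daru::DataFrame column by column

Its 0.2.28 counterpart appears to be lib/remi/data_subject/postgres.rb.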

data/lib/remi/data_source/salesforce.rb DELETED
@@ -1,87 +0,0 @@
-module Remi
-  module DataSource
-    class Salesforce
-      include DataSource
-
-      def initialize(fields: {}, object:, query:, credentials:, api: :bulk, logger: Remi::Settings.logger)
-        @fields = fields
-        @sfo = object
-        @query = query
-        @credentials = credentials
-        @api = api
-        @logger = logger
-      end
-
-      attr_accessor :fields
-      attr_accessor :raw_result
-
-      def field_symbolizer
-        Remi::FieldSymbolizers[:salesforce]
-      end
-
-      def extract
-        @raw_result = sf_bulk.query(@sfo, @query, 10000)
-
-        check_for_errors(@raw_result)
-        @raw_result
-      end
-
-      def check_for_errors(sf_result)
-        sf_result['batches'].each do |batch|
-          raise "Error with batch #{batch['id']} - #{batch['state']}: #{batch['stateMessage']}" unless batch['state'].first == 'Completed'
-        end
-      end
-
-      def raw_result
-        @raw_result ||= extract
-      end
-
-
-
-      def restforce_client
-        @restforce_client ||= begin
-          client = Restforce.new(@credentials)
-
-          #run a dummy query to initiate a connection. Workaround for Bulk API problem
-          # https://github.com/yatish27/salesforce_bulk_api/issues/33
-          client.query('SELECT Id FROM Contact LIMIT 1')
-          client
-        end
-      end
-
-      def sf_bulk
-        @sf_bulk ||= SalesforceBulkApi::Api.new(restforce_client).tap { |o| o.connection.set_status_throttle(5) }
-      end
-
-      def to_dataframe
-        @logger.info "Converting salesforce query results to a dataframe"
-
-        hash_array = {}
-        raw_result['batches'].each do |batch|
-          next unless batch['response']
-
-          batch['response'].each do |record|
-            record.each do |field, value|
-              next if ['xsi:type','type'].include? field
-              (hash_array[field.to_sym] ||= []) << case value.first
-                when Hash
-                  value.first["xsi:nil"] == "true" ? nil : value.first
-                else
-                  value.first
-              end
-            end
-          end
-
-          # delete raw result at end of processing to free memory
-          batch['response'] = nil
-        end
-
-        Daru::DataFrame.new hash_array, order: hash_array.keys
-      end
-
-      def df
-        @dataframe ||= to_dataframe
-      end
-    end
-  end
-end
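
Finally, a hypothetical usage sketch of the deleted Salesforce source, based only on the initializer above; the object, query, and credentials are placeholders (the credentials hash is simply passed to Restforce.new):

# Hypothetical pre-0.2.28 usage; object, query, and credentials are placeholders.
source = Remi::DataSource::Salesforce.new(
  fields: { Id: {}, Email: {} },
  object: 'Contact',
  query: 'SELECT Id, Email FROM Contact',
  credentials: { username: 'user@example.com', password: 'secret' }
)
source.df  # runs the bulk query and flattens the result batches into a Daru::DataFrame

As with the other sources, the equivalent 0.2.28 code appears to live in lib/remi/data_subject/salesforce.rb.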