remi 0.2.27 → 0.2.28

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49)
  1. checksums.yaml +4 -4
  2. data/Gemfile +3 -0
  3. data/Gemfile.lock +34 -5
  4. data/features/metadata.feature +17 -0
  5. data/features/step_definitions/remi_step.rb +6 -6
  6. data/features/transforms/date_diff.feature +1 -0
  7. data/jobs/aggregate_job.rb +0 -1
  8. data/jobs/all_jobs_shared.rb +0 -2
  9. data/jobs/copy_source_job.rb +0 -1
  10. data/jobs/csv_file_target_job.rb +0 -1
  11. data/jobs/metadata_job.rb +60 -0
  12. data/jobs/parameters_job.rb +1 -1
  13. data/jobs/sample_job.rb +19 -20
  14. data/jobs/sftp_file_target_job.rb +0 -1
  15. data/jobs/transforms/date_diff_job.rb +1 -1
  16. data/jobs/transforms/nvl_job.rb +1 -1
  17. data/jobs/transforms/parse_date_job.rb +7 -4
  18. data/jobs/transforms/prefix_job.rb +1 -1
  19. data/jobs/transforms/truncate_job.rb +1 -1
  20. data/lib/remi.rb +10 -15
  21. data/lib/remi/cucumber/business_rules.rb +23 -23
  22. data/lib/remi/cucumber/data_source.rb +2 -1
  23. data/lib/remi/data_frame.rb +36 -0
  24. data/lib/remi/data_frame/daru.rb +67 -0
  25. data/lib/remi/data_subject.rb +71 -10
  26. data/lib/remi/data_subject/csv_file.rb +151 -0
  27. data/lib/remi/data_subject/data_frame.rb +53 -0
  28. data/lib/remi/data_subject/postgres.rb +136 -0
  29. data/lib/remi/data_subject/salesforce.rb +136 -0
  30. data/lib/remi/data_subject/sftp_file.rb +66 -0
  31. data/lib/remi/fields.rb +8 -0
  32. data/lib/remi/source_to_target_map.rb +56 -32
  33. data/lib/remi/transform.rb +426 -83
  34. data/lib/remi/version.rb +1 -1
  35. data/remi.gemspec +2 -1
  36. data/spec/metadata_spec.rb +62 -0
  37. metadata +15 -28
  38. data/lib/remi/data_source.rb +0 -13
  39. data/lib/remi/data_source/csv_file.rb +0 -101
  40. data/lib/remi/data_source/data_frame.rb +0 -16
  41. data/lib/remi/data_source/postgres.rb +0 -58
  42. data/lib/remi/data_source/salesforce.rb +0 -87
  43. data/lib/remi/data_target.rb +0 -15
  44. data/lib/remi/data_target/csv_file.rb +0 -42
  45. data/lib/remi/data_target/data_frame.rb +0 -14
  46. data/lib/remi/data_target/postgres.rb +0 -74
  47. data/lib/remi/data_target/salesforce.rb +0 -54
  48. data/lib/remi/data_target/sftp_file.rb +0 -54
  49. data/lib/remi/refinements/daru.rb +0 -85
@@ -1,3 +1,3 @@
1
1
  module Remi
2
- VERSION = '0.2.27'
2
+ VERSION = '0.2.28'
3
3
  end
@@ -13,7 +13,8 @@ Gem::Specification.new do |s|
13
13
  s.description = "Data manipulation and ETL in Ruby"
14
14
 
15
15
  s.rubyforge_project = "Remi"
16
- s.add_runtime_dependency "daru", ["0.1.2"]
16
+ # Making use of a fork for now
17
+ # s.add_runtime_dependency "daru", ["0.1.2"]
17
18
 
18
19
  s.add_runtime_dependency 'bond', ['~> 0.5']
19
20
  s.add_runtime_dependency 'docile', ['~> 1.1']
@@ -0,0 +1,62 @@
1
+ =begin
2
+
3
+ this is probably mostly about data subjects
4
+
5
+
6
+ calling fields on the data subject should return only the fields defined, even if there
7
+ are additional fields on the dataframe
8
+
9
+ dataframe metadata is merged into field metadata, always with a preference for field metadata
10
+
11
+ metadata propagates through 1:1 STTMs
12
+
13
+ metadata propagates through intermediate dataframes that are not data subjects
14
+
15
+
16
+ puts '---- ROUTE 1 - direct -----'
17
+ out_activity.df = Remi::DataFrame.create(:daru, [], order: out_activity.fields.keys)#, index: in_activity.df.index)
18
+ Remi::SourceToTargetMap.apply(in_activity.df, out_activity.df) do
19
+ # map source(:activity_id, :student_id) .target(:activity_id, :student_id)
20
+ map source(:activity_id) .target(:activity_id)
21
+ .transform(->(v) { "-#{v}-" })
22
+ # enforce types needs to be based on the "fields" for the target
23
+ # I might have to convert any Daru Dataframe to Remi dataframes in the STTM
24
+
25
+ # .transform(Remi::Transform[:enforce_types].(on_error: :ignore))
26
+
27
+ map source(:student_id) .target(:student_id)
28
+ map source(:student_dob) .target(:student_dob)
29
+ end
30
+
31
+
32
+ puts "out_activity.fields: #{out_activity.fields}"
33
+ puts "out_activity.df metadata: #{out_activity.df.vector_metadata}"
34
+ out_activity.df = out_activity.df[*(out_activity.fields.keys)]
35
+ puts "out_activity.fields: #{out_activity.fields}"
36
+ puts "out_activity.df metadata: #{out_activity.df.vector_metadata}"
37
+ IRuby.display out_activity.df, type: 'text/html'
38
+
39
+
40
+ puts '---- ROUTE 2 - via work_df -----'
41
+ work_df = Remi::DataFrame.create(:daru, [], order: out_activity.fields.keys)#, index: in_activity.df.index)
42
+ Remi::SourceToTargetMap.apply(in_activity.df, work_df) do
43
+ map source(:activity_id) .target(:activity_id)
44
+ # .transform(Remi::Transform[:enforce_types].(on_error: :ignore))
45
+
46
+ map source(:student_id) .target(:student_id)
47
+ map source(:student_dob) .target(:student_dob)
48
+ end
49
+
50
+ IRuby.display work_df, type: 'text/html'
51
+ puts "work_df metadata: #{work_df.vector_metadata}"
52
+
53
+ puts "out_activity.fields metadata: #{out_activity.fields}"
54
+ puts "out_activity.df metadata: #{out_activity.df.vector_metadata}"
55
+ puts "work_df is a #{work_df.class}"
56
+ out_activity.df = work_df#[*out_activity.fields.keys]
57
+ puts "out_activity.fields metadata: #{out_activity.fields}"
58
+ puts "out_activity.df metadata: #{out_activity.df.vector_metadata}"
59
+ IRuby.display out_activity.df, type: 'text/html'
60
+
61
+
62
+ =end
metadata CHANGED
@@ -1,29 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: remi
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.27
4
+ version: 0.2.28
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sterling Paramore
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-04-13 00:00:00.000000000 Z
11
+ date: 2016-04-28 00:00:00.000000000 Z
12
12
  dependencies:
13
- - !ruby/object:Gem::Dependency
14
- name: daru
15
- requirement: !ruby/object:Gem::Requirement
16
- requirements:
17
- - - '='
18
- - !ruby/object:Gem::Version
19
- version: 0.1.2
20
- type: :runtime
21
- prerelease: false
22
- version_requirements: !ruby/object:Gem::Requirement
23
- requirements:
24
- - - '='
25
- - !ruby/object:Gem::Version
26
- version: 0.1.2
27
13
  - !ruby/object:Gem::Dependency
28
14
  name: bond
29
15
  requirement: !ruby/object:Gem::Requirement
@@ -200,6 +186,7 @@ files:
200
186
  - features/csv_file_target_job.feature
201
187
  - features/examples.feature
202
188
  - features/formulas.feature
189
+ - features/metadata.feature
203
190
  - features/parameters.feature
204
191
  - features/sample_job.feature
205
192
  - features/sftp_file_target_job.feature
@@ -215,6 +202,7 @@ files:
215
202
  - jobs/all_jobs_shared.rb
216
203
  - jobs/copy_source_job.rb
217
204
  - jobs/csv_file_target_job.rb
205
+ - jobs/metadata_job.rb
218
206
  - jobs/parameters_job.rb
219
207
  - jobs/sample_job.rb
220
208
  - jobs/sftp_file_target_job.rb
@@ -229,22 +217,18 @@ files:
229
217
  - lib/remi/cucumber.rb
230
218
  - lib/remi/cucumber/business_rules.rb
231
219
  - lib/remi/cucumber/data_source.rb
232
- - lib/remi/data_source.rb
233
- - lib/remi/data_source/csv_file.rb
234
- - lib/remi/data_source/data_frame.rb
235
- - lib/remi/data_source/postgres.rb
236
- - lib/remi/data_source/salesforce.rb
220
+ - lib/remi/data_frame.rb
221
+ - lib/remi/data_frame/daru.rb
237
222
  - lib/remi/data_subject.rb
238
- - lib/remi/data_target.rb
239
- - lib/remi/data_target/csv_file.rb
240
- - lib/remi/data_target/data_frame.rb
241
- - lib/remi/data_target/postgres.rb
242
- - lib/remi/data_target/salesforce.rb
243
- - lib/remi/data_target/sftp_file.rb
223
+ - lib/remi/data_subject/csv_file.rb
224
+ - lib/remi/data_subject/data_frame.rb
225
+ - lib/remi/data_subject/postgres.rb
226
+ - lib/remi/data_subject/salesforce.rb
227
+ - lib/remi/data_subject/sftp_file.rb
244
228
  - lib/remi/extractor/sftp_file.rb
245
229
  - lib/remi/field_symbolizers.rb
230
+ - lib/remi/fields.rb
246
231
  - lib/remi/job.rb
247
- - lib/remi/refinements/daru.rb
248
232
  - lib/remi/refinements/symbolizer.rb
249
233
  - lib/remi/settings.rb
250
234
  - lib/remi/sf_bulk_helper.rb
@@ -253,6 +237,7 @@ files:
253
237
  - lib/remi/version.rb
254
238
  - remi.gemspec
255
239
  - spec/extractor/sftp_file_spec.rb
240
+ - spec/metadata_spec.rb
256
241
  - spec/remi_spec.rb
257
242
  - workbooks/sample_workbook.ipynb
258
243
  - workbooks/workbook_helper.rb
@@ -285,6 +270,7 @@ test_files:
285
270
  - features/csv_file_target_job.feature
286
271
  - features/examples.feature
287
272
  - features/formulas.feature
273
+ - features/metadata.feature
288
274
  - features/parameters.feature
289
275
  - features/sample_job.feature
290
276
  - features/sftp_file_target_job.feature
@@ -297,4 +283,5 @@ test_files:
297
283
  - features/transforms/prefix.feature
298
284
  - features/transforms/truncate.feature
299
285
  - spec/extractor/sftp_file_spec.rb
286
+ - spec/metadata_spec.rb
300
287
  - spec/remi_spec.rb
@@ -1,13 +0,0 @@
1
- module Remi
2
- module DataSource
3
- include DataSubject
4
-
5
- def extract
6
- raise "Extract function undefined for #{self.class.name}"
7
- end
8
-
9
- def feild_symbolizer
10
- Remi::FieldSymbolizers[:standard]
11
- end
12
- end
13
- end
@@ -1,101 +0,0 @@
1
- module Remi
2
- module DataSource
3
- class CsvFile
4
- include DataSource
5
-
6
- using Remi::Refinements::Daru
7
-
8
- def self.default_csv_options
9
- CSV::DEFAULT_OPTIONS.merge({
10
- headers: true,
11
- header_converters: Remi::FieldSymbolizers[:standard],
12
- converters: [],
13
- col_sep: ',',
14
- encoding: 'UTF-8',
15
- quote_char: '"'
16
- })
17
- end
18
-
19
-
20
- def initialize(fields: {}, extractor:, csv_options: {}, filename_field: nil, logger: Remi::Settings.logger)
21
- @fields = fields
22
- self.extractor = extractor
23
- @csv_options = self.class.default_csv_options.merge(csv_options)
24
- @filename_field = filename_field
25
- @logger = logger
26
- end
27
-
28
- attr_accessor :fields
29
- attr_reader :extractor
30
- attr_reader :csv_options
31
-
32
- def field_symbolizer
33
- self.class.default_csv_options[:header_converters]
34
- end
35
-
36
- def extract
37
- @extracted = Array(@extractor.extract)
38
- end
39
-
40
- def extracted
41
- @extracted || extract
42
- end
43
-
44
- def extractor=(arg)
45
- case arg
46
- when Extractor::SftpFile, Extractor::LocalFile
47
- @extractor = arg
48
- when String
49
- @extractor = Extractor::LocalFile.new(path: arg)
50
- when Regexp
51
- raise "Adding regex matching to local files would be easy, not done yet"
52
- else
53
- raise "Unknown extractor of type #{arg.class}: #{arg}"
54
- end
55
- end
56
-
57
- # Only going to support single file for now
58
- def source_filename
59
- raise "Multiple source files detected" if extracted.size > 1
60
- @source_filename ||= extracted.first
61
- end
62
-
63
- def first_line
64
- # Readline assumes \n line endings. Strip out \r if it is a DOS file.
65
- @first_line ||= File.open(source_filename) do |f|
66
- f.readline.gsub(/\r/,'')
67
- end
68
- end
69
-
70
- def headers
71
- @headers ||= CSV.open(source_filename, 'r', source_csv_options) { |csv| csv.first }.headers
72
- end
73
-
74
- def valid_headers?
75
- (fields.keys - headers).empty?
76
- end
77
-
78
- def to_dataframe
79
- # Assumes that each file has exactly the same structure
80
- result_df = nil
81
- extracted.each_with_index do |filename, idx|
82
- @logger.info "Converting #{filename} to a dataframe"
83
- csv_df = Daru::DataFrame.from_csv filename, @csv_options
84
-
85
- csv_df[@filename_field] = Daru::Vector.new([filename] * csv_df.size, index: csv_df.index) if @filename_field
86
- if idx == 0
87
- result_df = csv_df
88
- else
89
- result_df = result_df.concat csv_df
90
- end
91
- end
92
-
93
- result_df
94
- end
95
-
96
- def df
97
- @dataframe ||= to_dataframe
98
- end
99
- end
100
- end
101
- end
@@ -1,16 +0,0 @@
1
- module Remi
2
- module DataSource
3
- class DataFrame
4
- include DataSubject
5
-
6
- def initialize(fields: {}, **args)
7
- @fields = fields
8
- end
9
-
10
- def df
11
- @dataframe ||= Daru::DataFrame.new([], order: @fields.keys)
12
- end
13
-
14
- end
15
- end
16
- end
@@ -1,58 +0,0 @@
1
- module Remi
2
- module DataSource
3
- class Postgres
4
- include DataSource
5
-
6
- def initialize(fields: {}, credentials:, query:, logger: Remi::Settings.logger)
7
- @fields = fields
8
- @credentials = credentials
9
- @query = query
10
- @logger = logger
11
- end
12
-
13
- attr_accessor :fields
14
-
15
- def extract
16
- @logger.info "Executing query #{@query}"
17
- @raw_result = connection.exec @query
18
- end
19
-
20
- def raw_result
21
- @raw_result ||= extract
22
- end
23
-
24
- def connection
25
- @connection ||= PG.connect(
26
- host: @credentials[:host] || 'localhost',
27
- port: @credentials[:port] || 5432,
28
- dbname: @credentials[:dbname],
29
- user: @credentials[:user] || `whoami`.chomp,
30
- password: @credentials[:password],
31
- sslmode: @credentials[:sslmode] || 'allow'
32
- )
33
- end
34
-
35
-
36
- def to_dataframe
37
- # Performance for larger sets could be improved by using bulk query (via COPY)
38
- @logger.info "Converting query to a dataframe"
39
-
40
- hash_array = {}
41
- raw_result.each do |row|
42
- row.each do |field, value|
43
- (hash_array[field_symbolizer.call(field)] ||= []) << value
44
- end
45
- end
46
-
47
- # After converting to DF, clear the PG results to save memory.
48
- raw_result.clear
49
-
50
- Daru::DataFrame.new hash_array, order: hash_array.keys
51
- end
52
-
53
- def df
54
- @dataframe ||= to_dataframe
55
- end
56
- end
57
- end
58
- end
@@ -1,87 +0,0 @@
1
- module Remi
2
- module DataSource
3
- class Salesforce
4
- include DataSource
5
-
6
- def initialize(fields: {}, object:, query:, credentials:, api: :bulk, logger: Remi::Settings.logger)
7
- @fields = fields
8
- @sfo = object
9
- @query = query
10
- @credentials = credentials
11
- @api = api
12
- @logger = logger
13
- end
14
-
15
- attr_accessor :fields
16
- attr_accessor :raw_result
17
-
18
- def field_symbolizer
19
- Remi::FieldSymbolizers[:salesforce]
20
- end
21
-
22
- def extract
23
- @raw_result = sf_bulk.query(@sfo, @query, 10000)
24
-
25
- check_for_errors(@raw_result)
26
- @raw_result
27
- end
28
-
29
- def check_for_errors(sf_result)
30
- sf_result['batches'].each do |batch|
31
- raise "Error with batch #{batch['id']} - #{batch['state']}: #{batch['stateMessage']}" unless batch['state'].first == 'Completed'
32
- end
33
- end
34
-
35
- def raw_result
36
- @raw_result ||= extract
37
- end
38
-
39
-
40
-
41
- def restforce_client
42
- @restforce_client ||= begin
43
- client = Restforce.new(@credentials)
44
-
45
- #run a dummy query to initiate a connection. Workaround for Bulk API problem
46
- # https://github.com/yatish27/salesforce_bulk_api/issues/33
47
- client.query('SELECT Id FROM Contact LIMIT 1')
48
- client
49
- end
50
- end
51
-
52
- def sf_bulk
53
- @sf_bulk ||= SalesforceBulkApi::Api.new(restforce_client).tap { |o| o.connection.set_status_throttle(5) }
54
- end
55
-
56
- def to_dataframe
57
- @logger.info "Converting salesforce query results to a dataframe"
58
-
59
- hash_array = {}
60
- raw_result['batches'].each do |batch|
61
- next unless batch['response']
62
-
63
- batch['response'].each do |record|
64
- record.each do |field, value|
65
- next if ['xsi:type','type'].include? field
66
- (hash_array[field.to_sym] ||= []) << case value.first
67
- when Hash
68
- value.first["xsi:nil"] == "true" ? nil : value.first
69
- else
70
- value.first
71
- end
72
- end
73
- end
74
-
75
- # delete raw result at end of processing to free memory
76
- batch['response'] = nil
77
- end
78
-
79
- Daru::DataFrame.new hash_array, order: hash_array.keys
80
- end
81
-
82
- def df
83
- @dataframe ||= to_dataframe
84
- end
85
- end
86
- end
87
- end