remi 0.2.27 → 0.2.28

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Files changed (49)
  1. checksums.yaml +4 -4
  2. data/Gemfile +3 -0
  3. data/Gemfile.lock +34 -5
  4. data/features/metadata.feature +17 -0
  5. data/features/step_definitions/remi_step.rb +6 -6
  6. data/features/transforms/date_diff.feature +1 -0
  7. data/jobs/aggregate_job.rb +0 -1
  8. data/jobs/all_jobs_shared.rb +0 -2
  9. data/jobs/copy_source_job.rb +0 -1
  10. data/jobs/csv_file_target_job.rb +0 -1
  11. data/jobs/metadata_job.rb +60 -0
  12. data/jobs/parameters_job.rb +1 -1
  13. data/jobs/sample_job.rb +19 -20
  14. data/jobs/sftp_file_target_job.rb +0 -1
  15. data/jobs/transforms/date_diff_job.rb +1 -1
  16. data/jobs/transforms/nvl_job.rb +1 -1
  17. data/jobs/transforms/parse_date_job.rb +7 -4
  18. data/jobs/transforms/prefix_job.rb +1 -1
  19. data/jobs/transforms/truncate_job.rb +1 -1
  20. data/lib/remi.rb +10 -15
  21. data/lib/remi/cucumber/business_rules.rb +23 -23
  22. data/lib/remi/cucumber/data_source.rb +2 -1
  23. data/lib/remi/data_frame.rb +36 -0
  24. data/lib/remi/data_frame/daru.rb +67 -0
  25. data/lib/remi/data_subject.rb +71 -10
  26. data/lib/remi/data_subject/csv_file.rb +151 -0
  27. data/lib/remi/data_subject/data_frame.rb +53 -0
  28. data/lib/remi/data_subject/postgres.rb +136 -0
  29. data/lib/remi/data_subject/salesforce.rb +136 -0
  30. data/lib/remi/data_subject/sftp_file.rb +66 -0
  31. data/lib/remi/fields.rb +8 -0
  32. data/lib/remi/source_to_target_map.rb +56 -32
  33. data/lib/remi/transform.rb +426 -83
  34. data/lib/remi/version.rb +1 -1
  35. data/remi.gemspec +2 -1
  36. data/spec/metadata_spec.rb +62 -0
  37. metadata +15 -28
  38. data/lib/remi/data_source.rb +0 -13
  39. data/lib/remi/data_source/csv_file.rb +0 -101
  40. data/lib/remi/data_source/data_frame.rb +0 -16
  41. data/lib/remi/data_source/postgres.rb +0 -58
  42. data/lib/remi/data_source/salesforce.rb +0 -87
  43. data/lib/remi/data_target.rb +0 -15
  44. data/lib/remi/data_target/csv_file.rb +0 -42
  45. data/lib/remi/data_target/data_frame.rb +0 -14
  46. data/lib/remi/data_target/postgres.rb +0 -74
  47. data/lib/remi/data_target/salesforce.rb +0 -54
  48. data/lib/remi/data_target/sftp_file.rb +0 -54
  49. data/lib/remi/refinements/daru.rb +0 -85
--- a/data/lib/remi/data_target.rb
+++ /dev/null
@@ -1,15 +0,0 @@
-module Remi
-  module DataTarget
-    include DataSubject
-
-    # Gets called automatically at the end of a job, but could
-    # also get manually called at the end of a transform so make
-    # sure it doesn't do it twice.
-    def load
-      @logger.info "Loading target"
-      return true if @loaded
-      @loaded = true
-      raise "Load function undefined for #{self.class.name}"
-    end
-  end
-end
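
Every removed target class followed the contract this module defined: mix in DataTarget, keep load idempotent via the @loaded guard, and supply a real load implementation. A minimal sketch of that pattern (MyTarget and its body are hypothetical):

module Remi
  module DataTarget
    class MyTarget                # hypothetical example target
      include DataTarget

      def load
        return true if @loaded    # idempotency guard, as in the removed subclasses
        # ... write the dataframe (df) to its destination here ...
        @loaded = true
      end
    end
  end
end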
--- a/data/lib/remi/data_target/csv_file.rb
+++ /dev/null
@@ -1,42 +0,0 @@
-module Remi
-  module DataTarget
-    class CsvFile
-      include DataTarget
-
-      def self.default_csv_options
-        CSV::DEFAULT_OPTIONS.merge({
-          headers: true,
-          header_converters: Remi::FieldSymbolizers[:standard],
-          col_sep: ',',
-          encoding: 'UTF-8',
-          quote_char: '"',
-          row_sep: "\n"
-        })
-      end
-
-      def initialize(path:, csv_options: {}, logger: Remi::Settings.logger)
-        @path = path
-        @csv_options = self.class.default_csv_options.merge(csv_options)
-        @logger = logger
-      end
-
-      attr_reader :path
-      attr_reader :csv_options
-
-      def field_symbolizer
-        self.class.default_csv_options[:header_converters]
-      end
-
-      def load
-        return true if @loaded || df.size == 0
-
-        @logger.info "Writing CSV file #{@path}"
-
-        df.write_csv @path, @csv_options
-
-        @loaded = true
-      end
-
-    end
-  end
-end
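
For reference, a minimal usage sketch of the removed class; the path is a placeholder, and csv_options are merged over default_csv_options so only overrides need to be given:

target = Remi::DataTarget::CsvFile.new(
  path: 'export.csv',              # placeholder path
  csv_options: { col_sep: '|' }    # merged over default_csv_options
)
target.load                        # no-op when already loaded or the dataframe is empty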
--- a/data/lib/remi/data_target/data_frame.rb
+++ /dev/null
@@ -1,14 +0,0 @@
-module Remi
-  module DataTarget
-    class DataFrame
-      include DataSubject
-
-      def initialize(**args)
-      end
-
-      def load
-        true
-      end
-    end
-  end
-end
--- a/data/lib/remi/data_target/postgres.rb
+++ /dev/null
@@ -1,74 +0,0 @@
-module Remi
-  module DataTarget
-    class Postgres
-      include DataTarget
-
-      def initialize(credentials:, table_name:, fields:, logger: Remi::Settings.logger)
-        @credentials = credentials
-        @table_name = table_name
-        @fields = fields
-        @logger = logger
-      end
-
-      def load
-        return true if @loaded || df.size == 0
-
-        @logger.info "Performing postgres load to table #{@table_name}"
-        create_target_table
-        load_target_table
-
-        @loaded = true
-      end
-
-
-      def connection
-        @connection ||= PG.connect(
-          host: @credentials[:host] || 'localhost',
-          port: @credentials[:port] || 5432,
-          dbname: @credentials[:dbname],
-          user: @credentials[:user] || `whoami`.chomp,
-          password: @credentials[:password],
-          sslmode: @credentials[:sslmode] || 'allow'
-        )
-      end
-
-
-      def fields_with_type_ddl
-        @fields.map { |k,v| "#{k} #{v[:type]}" }.join(', ')
-      end
-
-      def create_target_table
-        connection.exec <<-EOT
-          CREATE TEMPORARY TABLE #{@table_name} (
-            #{fields_with_type_ddl}
-          )
-        EOT
-      end
-
-      def load_target_table
-        connection.copy_data "COPY #{@table_name} (#{@fields.keys.join(', ')}) FROM STDIN" do
-          df.each(:row) do |row|
-            row_str = @fields.keys.map do |field|
-              field = row[field]
-              case
-              when field.respond_to?(:strftime)
-                field.strftime('%Y-%m-%d %H:%M:%S')
-              when field.respond_to?(:map)
-                field.to_json.gsub("\t", '\t')
-              when field.blank? && !field.nil?
-                ''
-              when field.nil?
-                '\N'
-              else
-                field.to_s.gsub("\t", '\t')
-              end
-            end.join("\t")
-
-            connection.put_copy_data row_str + "\n"
-          end
-        end
-      end
-
-    end
-  end
-end
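
A usage sketch with placeholder credentials and hypothetical field metadata; the fields hash supplies both the COPY column list and the DDL types used by create_target_table:

target = Remi::DataTarget::Postgres.new(
  credentials: { dbname: 'mydb' },    # placeholder credentials
  table_name:  'contacts_stage',
  fields:      { id: { type: 'INTEGER' }, name: { type: 'VARCHAR(80)' } }
)
target.load    # creates the temporary table, then streams rows via COPY ... FROM STDIN

Rows are serialized for COPY's tab-delimited text format: nil becomes \N, times are rendered with strftime, and array/hash values are JSON-encoded.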
--- a/data/lib/remi/data_target/salesforce.rb
+++ /dev/null
@@ -1,54 +0,0 @@
-module Remi
-  module DataTarget
-    class Salesforce
-      include DataTarget
-
-      def initialize(object:, operation:, credentials:, external_id: 'Id', api: :bulk, logger: Remi::Settings.logger)
-        @sfo = object
-        @operation = operation
-        @external_id = external_id
-        @credentials = credentials
-        @api = api
-        @logger = logger
-      end
-
-      def field_symbolizer
-        Remi::FieldSymbolizers[:salesforce]
-      end
-
-      def load
-        return true if @loaded || df.size == 0
-
-        @logger.info "Performing Salesforce #{@operation} on object #{@sfo}"
-
-        if @operation == :update
-          Remi::SfBulkHelper::SfBulkUpdate.update(restforce_client, @sfo, df_as_array_of_hashes, logger: @logger)
-        elsif @operation == :create
-          Remi::SfBulkHelper::SfBulkCreate.create(restforce_client, @sfo, df_as_array_of_hashes, logger: @logger)
-        elsif @operation == :upsert
-          Remi::SfBulkHelper::SfBulkUpsert.upsert(restforce_client, @sfo, df_as_array_of_hashes, external_id: @external_id, logger: @logger)
-        else
-          raise "Unknown operation: #{@operation}"
-        end
-
-        @loaded = true
-      end
-
-      def restforce_client
-        @restforce_client ||= begin
-          client = Restforce.new(@credentials)
-
-          #run a dummy query to initiate a connection. Workaround for Bulk API problem
-          # https://github.com/yatish27/salesforce_bulk_api/issues/33
-          client.query('SELECT Id FROM Contact LIMIT 1')
-          client
-        end
-      end
-
-      def df_as_array_of_hashes
-        df.to_a[0]
-      end
-
-    end
-  end
-end
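
A usage sketch with placeholder credentials; operation must be :update, :create, or :upsert (anything else raises at load time), and the external id field named here is hypothetical:

target = Remi::DataTarget::Salesforce.new(
  object:      'Contact',
  operation:   :upsert,
  external_id: 'External_Id__c',    # hypothetical external id field
  credentials: { username: 'user@example.com', password: 'secret' }
)
target.load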
--- a/data/lib/remi/data_target/sftp_file.rb
+++ /dev/null
@@ -1,54 +0,0 @@
-module Remi
-  module DataTarget
-    class SftpFile
-      include DataTarget
-
-      def initialize(credentials:, local_path:, remote_path: File.basename(local_path), logger: Remi::Settings.logger)
-        @credentials = credentials
-        @local_path = local_path
-        @remote_path = remote_path
-        @logger = logger
-      end
-
-      attr_reader :local_path
-      attr_reader :remote_path
-
-      def load
-        return true if @loaded
-
-        connection do |sftp|
-          retry_upload { sftp.upload! @local_path, @remote_path }
-        end
-
-        @loaded = true
-      end
-
-
-
-      private
-
-      def connection(&block)
-        result = nil
-        Net::SFTP.start(@credentials[:host], @credentials[:username], password: @credentials[:password], port: @credentials[:port] || '22') do |sftp|
-          result = yield sftp
-        end
-        result
-      end
-
-      def retry_upload(ntry=2, &block)
-        1.upto(ntry).each do |itry|
-          begin
-            block.call
-          rescue RuntimeError => err
-            raise err unless itry < ntry
-            @logger.error "Upload failed with error: #{err.message}"
-            @logger.error "Retry attempt #{itry}/#{ntry-1}"
-            sleep(1)
-          end
-        end
-      end
-
-
-    end
-  end
-end
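
A usage sketch with placeholder credentials; remote_path defaults to File.basename(local_path), and a failed upload is retried once (ntry = 2) before the RuntimeError is re-raised:

target = Remi::DataTarget::SftpFile.new(
  credentials: { host: 'sftp.example.com', username: 'deploy', password: 'secret' },
  local_path:  'extract.csv'    # placeholder; remote_path defaults to 'extract.csv'
)
target.load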
--- a/data/lib/remi/refinements/daru.rb
+++ /dev/null
@@ -1,85 +0,0 @@
-module Remi
-  module Refinements
-    module Daru
-      refine ::Daru::DataFrame do
-
-        # Public: Fixes dup issues in the Daru library (vectors not being duped).
-        def dup
-          dupdf = ::Daru::DataFrame.new([], index: self.index)
-          self.vectors.each do |v|
-            dupdf[v] = self[v]
-          end
-
-          dupdf
-        end
-
-        # Public: Allows for combining dataframes with different columns
-        def concat other_df
-          vectors = @vectors.to_a
-          data = []
-
-          vectors.each do |v|
-            other_vec = other_df.vectors.include?(v) ? other_df[v].to_a : [nil] * other_df.size
-            data << self[v].dup.to_a.concat(other_vec)
-          end
-
-          other_df.vectors.each do |v|
-            next if vectors.include?(v)
-            vectors << v
-            data << ([nil] * self.size).concat(other_df[v].to_a)
-          end
-
-          ::Daru::DataFrame.new(data, order: vectors)
-        end
-
-        # Public: Saves a Dataframe to a file.
-        def hash_dump(filename)
-          File.binwrite(filename, Marshal.dump(self.to_hash))
-        end
-
-        # Public: Allows the user to define an arbitrary aggregation function.
-        #
-        # by   - The name of the DataFrame vector to use to group records.
-        # func - A lambda function that accepts three arguments - the
-        #        first argument is the DataFrame, the second is the
-        #        key to the current group, and the third is the index
-        #        of the elements belonging to a group.
-        #
-        # Example:
-        #   df = Daru::DataFrame.new( { a: ['a','a','a','b','b'], year: ['2018','2015','2019', '2014', '2013'] })
-        #
-        #   mymin = lambda do |field, df, group_key, indices|
-        #     values = indices.map { |idx| df.row[idx][field] }
-        #     "Group #{group_key} has a minimum value of #{values.min}"
-        #   end
-        #
-        #   df.aggregate(by: :a, func: mymin.curry.(:year))
-        #
-        #
-        # Returns a Daru::Vector.
-        def aggregate(by:, func:)
-          grouped = self.group_by(by)
-          df_indices = self.index.to_a
-          ::Daru::Vector.new(
-            grouped.groups.reduce({}) do |h, (key, indices)|
-              # Daru groups don't use the index of the dataframe when returning groups (WTF?).
-              # Instead they return the position of the record in the dataframe. Here, we
-              group_df_indices = indices.map { |v| df_indices[v] }
-              group_key = key.size == 1 ? key.first : key
-              h[group_key] = func.(self, group_key, group_df_indices)
-              h
-            end
-          )
-        end
-
-      end
-
-      refine ::Daru::DataFrame.singleton_class do
-        # Public: Creates a DataFrame by reading the dumped version from a file.
-        def from_hash_dump(filename)
-          ::Daru::DataFrame.new(Marshal.load(File.binread(filename)))
-        end
-      end
-    end
-  end
-end
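
Because these are Ruby refinements, they only apply in files that opt in with a using directive. A minimal sketch of how the concat refinement behaved (column union with nil fill):

using Remi::Refinements::Daru

df1 = Daru::DataFrame.new(a: [1, 2])
df2 = Daru::DataFrame.new(a: [3], b: ['x'])
combined = df1.concat(df2)    # vectors are unioned; missing cells are filled with nil
combined[:b].to_a             # => [nil, nil, "x"]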