remi 0.2.27 → 0.2.28

Sign up to get free protection for your applications and to get access to all the features.
Files changed (49)
  1. checksums.yaml +4 -4
  2. data/Gemfile +3 -0
  3. data/Gemfile.lock +34 -5
  4. data/features/metadata.feature +17 -0
  5. data/features/step_definitions/remi_step.rb +6 -6
  6. data/features/transforms/date_diff.feature +1 -0
  7. data/jobs/aggregate_job.rb +0 -1
  8. data/jobs/all_jobs_shared.rb +0 -2
  9. data/jobs/copy_source_job.rb +0 -1
  10. data/jobs/csv_file_target_job.rb +0 -1
  11. data/jobs/metadata_job.rb +60 -0
  12. data/jobs/parameters_job.rb +1 -1
  13. data/jobs/sample_job.rb +19 -20
  14. data/jobs/sftp_file_target_job.rb +0 -1
  15. data/jobs/transforms/date_diff_job.rb +1 -1
  16. data/jobs/transforms/nvl_job.rb +1 -1
  17. data/jobs/transforms/parse_date_job.rb +7 -4
  18. data/jobs/transforms/prefix_job.rb +1 -1
  19. data/jobs/transforms/truncate_job.rb +1 -1
  20. data/lib/remi.rb +10 -15
  21. data/lib/remi/cucumber/business_rules.rb +23 -23
  22. data/lib/remi/cucumber/data_source.rb +2 -1
  23. data/lib/remi/data_frame.rb +36 -0
  24. data/lib/remi/data_frame/daru.rb +67 -0
  25. data/lib/remi/data_subject.rb +71 -10
  26. data/lib/remi/data_subject/csv_file.rb +151 -0
  27. data/lib/remi/data_subject/data_frame.rb +53 -0
  28. data/lib/remi/data_subject/postgres.rb +136 -0
  29. data/lib/remi/data_subject/salesforce.rb +136 -0
  30. data/lib/remi/data_subject/sftp_file.rb +66 -0
  31. data/lib/remi/fields.rb +8 -0
  32. data/lib/remi/source_to_target_map.rb +56 -32
  33. data/lib/remi/transform.rb +426 -83
  34. data/lib/remi/version.rb +1 -1
  35. data/remi.gemspec +2 -1
  36. data/spec/metadata_spec.rb +62 -0
  37. metadata +15 -28
  38. data/lib/remi/data_source.rb +0 -13
  39. data/lib/remi/data_source/csv_file.rb +0 -101
  40. data/lib/remi/data_source/data_frame.rb +0 -16
  41. data/lib/remi/data_source/postgres.rb +0 -58
  42. data/lib/remi/data_source/salesforce.rb +0 -87
  43. data/lib/remi/data_target.rb +0 -15
  44. data/lib/remi/data_target/csv_file.rb +0 -42
  45. data/lib/remi/data_target/data_frame.rb +0 -14
  46. data/lib/remi/data_target/postgres.rb +0 -74
  47. data/lib/remi/data_target/salesforce.rb +0 -54
  48. data/lib/remi/data_target/sftp_file.rb +0 -54
  49. data/lib/remi/refinements/daru.rb +0 -85
@@ -1,15 +0,0 @@
1
- module Remi
2
- module DataTarget
3
- include DataSubject
4
-
5
- # Gets called automatically at the end of a job, but could
6
- # also get manually called at the end of a transform so make
7
- # sure it doesn't do it twice.
8
- def load
9
- @logger.info "Loading target"
10
- return true if @loaded
11
- @loaded = true
12
- raise "Load function undefined for #{self.class.name}"
13
- end
14
- end
15
- end
@@ -1,42 +0,0 @@
1
- module Remi
2
- module DataTarget
3
- class CsvFile
4
- include DataTarget
5
-
6
- def self.default_csv_options
7
- CSV::DEFAULT_OPTIONS.merge({
8
- headers: true,
9
- header_converters: Remi::FieldSymbolizers[:standard],
10
- col_sep: ',',
11
- encoding: 'UTF-8',
12
- quote_char: '"',
13
- row_sep: "\n"
14
- })
15
- end
16
-
17
- def initialize(path:, csv_options: {}, logger: Remi::Settings.logger)
18
- @path = path
19
- @csv_options = self.class.default_csv_options.merge(csv_options)
20
- @logger = logger
21
- end
22
-
23
- attr_reader :path
24
- attr_reader :csv_options
25
-
26
- def field_symbolizer
27
- self.class.default_csv_options[:header_converters]
28
- end
29
-
30
- def load
31
- return true if @loaded || df.size == 0
32
-
33
- @logger.info "Writing CSV file #{@path}"
34
-
35
- df.write_csv @path, @csv_options
36
-
37
- @loaded = true
38
- end
39
-
40
- end
41
- end
42
- end
@@ -1,14 +0,0 @@
1
- module Remi
2
- module DataTarget
3
- class DataFrame
4
- include DataSubject
5
-
6
- def initialize(**args)
7
- end
8
-
9
- def load
10
- true
11
- end
12
- end
13
- end
14
- end
@@ -1,74 +0,0 @@
1
- module Remi
2
- module DataTarget
3
- class Postgres
4
- include DataTarget
5
-
6
- def initialize(credentials:, table_name:, fields:, logger: Remi::Settings.logger)
7
- @credentials = credentials
8
- @table_name = table_name
9
- @fields = fields
10
- @logger = logger
11
- end
12
-
13
- def load
14
- return true if @loaded || df.size == 0
15
-
16
- @logger.info "Performing postgres load to table #{@table_name}"
17
- create_target_table
18
- load_target_table
19
-
20
- @loaded = true
21
- end
22
-
23
-
24
- def connection
25
- @connection ||= PG.connect(
26
- host: @credentials[:host] || 'localhost',
27
- port: @credentials[:port] || 5432,
28
- dbname: @credentials[:dbname],
29
- user: @credentials[:user] || `whoami`.chomp,
30
- password: @credentials[:password],
31
- sslmode: @credentials[:sslmode] || 'allow'
32
- )
33
- end
34
-
35
-
36
- def fields_with_type_ddl
37
- @fields.map { |k,v| "#{k} #{v[:type]}" }.join(', ')
38
- end
39
-
40
- def create_target_table
41
- connection.exec <<-EOT
42
- CREATE TEMPORARY TABLE #{@table_name} (
43
- #{fields_with_type_ddl}
44
- )
45
- EOT
46
- end
47
-
48
- def load_target_table
49
- connection.copy_data "COPY #{@table_name} (#{@fields.keys.join(', ')}) FROM STDIN" do
50
- df.each(:row) do |row|
51
- row_str = @fields.keys.map do |field|
52
- field = row[field]
53
- case
54
- when field.respond_to?(:strftime)
55
- field.strftime('%Y-%m-%d %H:%M:%S')
56
- when field.respond_to?(:map)
57
- field.to_json.gsub("\t", '\t')
58
- when field.blank? && !field.nil?
59
- ''
60
- when field.nil?
61
- '\N'
62
- else
63
- field.to_s.gsub("\t", '\t')
64
- end
65
- end.join("\t")
66
-
67
- connection.put_copy_data row_str + "\n"
68
- end
69
- end
70
- end
71
-
72
- end
73
- end
74
- end
@@ -1,54 +0,0 @@
1
- module Remi
2
- module DataTarget
3
- class Salesforce
4
- include DataTarget
5
-
6
- def initialize(object:, operation:, credentials:, external_id: 'Id', api: :bulk, logger: Remi::Settings.logger)
7
- @sfo = object
8
- @operation = operation
9
- @external_id = external_id
10
- @credentials = credentials
11
- @api = api
12
- @logger = logger
13
- end
14
-
15
- def field_symbolizer
16
- Remi::FieldSymbolizers[:salesforce]
17
- end
18
-
19
- def load
20
- return true if @loaded || df.size == 0
21
-
22
- @logger.info "Performing Salesforce #{@operation} on object #{@sfo}"
23
-
24
- if @operation == :update
25
- Remi::SfBulkHelper::SfBulkUpdate.update(restforce_client, @sfo, df_as_array_of_hashes, logger: @logger)
26
- elsif @operation == :create
27
- Remi::SfBulkHelper::SfBulkCreate.create(restforce_client, @sfo, df_as_array_of_hashes, logger: @logger)
28
- elsif @operation == :upsert
29
- Remi::SfBulkHelper::SfBulkUpsert.upsert(restforce_client, @sfo, df_as_array_of_hashes, external_id: @external_id, logger: @logger)
30
- else
31
- raise "Unknown operation: #{@operation}"
32
- end
33
-
34
- @loaded = true
35
- end
36
-
37
- def restforce_client
38
- @restforce_client ||= begin
39
- client = Restforce.new(@credentials)
40
-
41
- #run a dummy query to initiate a connection. Workaround for Bulk API problem
42
- # https://github.com/yatish27/salesforce_bulk_api/issues/33
43
- client.query('SELECT Id FROM Contact LIMIT 1')
44
- client
45
- end
46
- end
47
-
48
- def df_as_array_of_hashes
49
- df.to_a[0]
50
- end
51
-
52
- end
53
- end
54
- end
@@ -1,54 +0,0 @@
1
- module Remi
2
- module DataTarget
3
- class SftpFile
4
- include DataTarget
5
-
6
- def initialize(credentials:, local_path:, remote_path: File.basename(local_path), logger: Remi::Settings.logger)
7
- @credentials = credentials
8
- @local_path = local_path
9
- @remote_path = remote_path
10
- @logger = logger
11
- end
12
-
13
- attr_reader :local_path
14
- attr_reader :remote_path
15
-
16
- def load
17
- return true if @loaded
18
-
19
- connection do |sftp|
20
- retry_upload { sftp.upload! @local_path, @remote_path }
21
- end
22
-
23
- @loaded = true
24
- end
25
-
26
-
27
-
28
- private
29
-
30
- def connection(&block)
31
- result = nil
32
- Net::SFTP.start(@credentials[:host], @credentials[:username], password: @credentials[:password], port: @credentials[:port] || '22') do |sftp|
33
- result = yield sftp
34
- end
35
- result
36
- end
37
-
38
- def retry_upload(ntry=2, &block)
39
- 1.upto(ntry).each do |itry|
40
- begin
41
- block.call
42
- rescue RuntimeError => err
43
- raise err unless itry < ntry
44
- @logger.error "Upload failed with error: #{err.message}"
45
- @logger.error "Retry attempt #{itry}/#{ntry-1}"
46
- sleep(1)
47
- end
48
- end
49
- end
50
-
51
-
52
- end
53
- end
54
- end
@@ -1,85 +0,0 @@
1
- module Remi
2
- module Refinements
3
- module Daru
4
- refine ::Daru::DataFrame do
5
-
6
- # Public: Fixes dup issues in the Daru library (vectors not being duped).
7
- def dup
8
- dupdf = ::Daru::DataFrame.new([], index: self.index)
9
- self.vectors.each do |v|
10
- dupdf[v] = self[v]
11
- end
12
-
13
- dupdf
14
- end
15
-
16
- # Public: Allows for combining dataframes with different columns
17
- def concat other_df
18
- vectors = @vectors.to_a
19
- data = []
20
-
21
- vectors.each do |v|
22
- other_vec = other_df.vectors.include?(v) ? other_df[v].to_a : [nil] * other_df.size
23
- data << self[v].dup.to_a.concat(other_vec)
24
- end
25
-
26
- other_df.vectors.each do |v|
27
- next if vectors.include?(v)
28
- vectors << v
29
- data << ([nil] * self.size).concat(other_df[v].to_a)
30
- end
31
-
32
- ::Daru::DataFrame.new(data, order: vectors)
33
- end
34
-
35
- # Public: Saves a Dataframe to a file.
36
- def hash_dump(filename)
37
- File.binwrite(filename, Marshal.dump(self.to_hash))
38
- end
39
-
40
- # Public: Allows the user to define an arbitrary aggregation function.
41
- #
42
- # by - The name of the DataFrame vector to use to group records.
43
- # func - A lambda function that accepts three arguments - the
44
- # first argument is the DataFrame, the second is the
45
- # key to the current group, and the third is the index
46
- # of the elements belonging to a group.
47
- #
48
- # Example:
49
- # df = Daru::DataFrame.new( { a: ['a','a','a','b','b'], year: ['2018','2015','2019', '2014', '2013'] })
50
- #
51
- # mymin = lambda do |field, df, group_key, indices|
52
- # values = indices.map { |idx| df.row[idx][field] }
53
- # "Group #{group_key} has a minimum value of #{values.min}"
54
- # end
55
- #
56
- # df.aggregate(by: :a, func: mymin.curry.(:year))
57
- #
58
- #
59
- # Returns a Daru::Vector.
60
- def aggregate(by:, func:)
61
- grouped = self.group_by(by)
62
- df_indices = self.index.to_a
63
- ::Daru::Vector.new(
64
- grouped.groups.reduce({}) do |h, (key, indices)|
65
- # Daru groups don't use the index of the dataframe when returning groups (WTF?).
66
- # Instead they return the position of the record in the dataframe. Here, we
67
- group_df_indices = indices.map { |v| df_indices[v] }
68
- group_key = key.size == 1 ? key.first : key
69
- h[group_key] = func.(self, group_key, group_df_indices)
70
- h
71
- end
72
- )
73
- end
74
-
75
- end
76
-
77
- refine ::Daru::DataFrame.singleton_class do
78
- # Public: Creates a DataFrame by reading the dumped version from a file.
79
- def from_hash_dump(filename)
80
- ::Daru::DataFrame.new(Marshal.load(File.binread(filename)))
81
- end
82
- end
83
- end
84
- end
85
- end