remi 0.2.27 → 0.2.28
- checksums.yaml +4 -4
- data/Gemfile +3 -0
- data/Gemfile.lock +34 -5
- data/features/metadata.feature +17 -0
- data/features/step_definitions/remi_step.rb +6 -6
- data/features/transforms/date_diff.feature +1 -0
- data/jobs/aggregate_job.rb +0 -1
- data/jobs/all_jobs_shared.rb +0 -2
- data/jobs/copy_source_job.rb +0 -1
- data/jobs/csv_file_target_job.rb +0 -1
- data/jobs/metadata_job.rb +60 -0
- data/jobs/parameters_job.rb +1 -1
- data/jobs/sample_job.rb +19 -20
- data/jobs/sftp_file_target_job.rb +0 -1
- data/jobs/transforms/date_diff_job.rb +1 -1
- data/jobs/transforms/nvl_job.rb +1 -1
- data/jobs/transforms/parse_date_job.rb +7 -4
- data/jobs/transforms/prefix_job.rb +1 -1
- data/jobs/transforms/truncate_job.rb +1 -1
- data/lib/remi.rb +10 -15
- data/lib/remi/cucumber/business_rules.rb +23 -23
- data/lib/remi/cucumber/data_source.rb +2 -1
- data/lib/remi/data_frame.rb +36 -0
- data/lib/remi/data_frame/daru.rb +67 -0
- data/lib/remi/data_subject.rb +71 -10
- data/lib/remi/data_subject/csv_file.rb +151 -0
- data/lib/remi/data_subject/data_frame.rb +53 -0
- data/lib/remi/data_subject/postgres.rb +136 -0
- data/lib/remi/data_subject/salesforce.rb +136 -0
- data/lib/remi/data_subject/sftp_file.rb +66 -0
- data/lib/remi/fields.rb +8 -0
- data/lib/remi/source_to_target_map.rb +56 -32
- data/lib/remi/transform.rb +426 -83
- data/lib/remi/version.rb +1 -1
- data/remi.gemspec +2 -1
- data/spec/metadata_spec.rb +62 -0
- metadata +15 -28
- data/lib/remi/data_source.rb +0 -13
- data/lib/remi/data_source/csv_file.rb +0 -101
- data/lib/remi/data_source/data_frame.rb +0 -16
- data/lib/remi/data_source/postgres.rb +0 -58
- data/lib/remi/data_source/salesforce.rb +0 -87
- data/lib/remi/data_target.rb +0 -15
- data/lib/remi/data_target/csv_file.rb +0 -42
- data/lib/remi/data_target/data_frame.rb +0 -14
- data/lib/remi/data_target/postgres.rb +0 -74
- data/lib/remi/data_target/salesforce.rb +0 -54
- data/lib/remi/data_target/sftp_file.rb +0 -54
- data/lib/remi/refinements/daru.rb +0 -85
data/lib/remi/data_target.rb
DELETED
@@ -1,15 +0,0 @@
-module Remi
-  module DataTarget
-    include DataSubject
-
-    # Gets called automatically at the end of a job, but could
-    # also get manually called at the end of a transform so make
-    # sure it doesn't do it twice.
-    def load
-      @logger.info "Loading target"
-      return true if @loaded
-      @loaded = true
-      raise "Load function undefined for #{self.class.name}"
-    end
-  end
-end
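The mixin above was the entire target contract in 0.2.27: include DataTarget, then override load, keeping the double-load guard. A minimal sketch of a conforming target under that contract (the NullTarget name is hypothetical, and df is assumed to be supplied by the DataSubject side, as in the concrete classes below):

    require 'remi'

    module Remi
      module DataTarget
        # Hypothetical target that discards its dataframe instead of loading it.
        class NullTarget
          include DataTarget

          def initialize(logger: Remi::Settings.logger)
            @logger = logger
          end

          # Overrides the mixin's load, which would otherwise raise.
          def load
            return true if @loaded
            @logger.info "Discarding #{df.size} rows"
            @loaded = true
          end
        end
      end
    end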
data/lib/remi/data_target/csv_file.rb
DELETED
@@ -1,42 +0,0 @@
-module Remi
-  module DataTarget
-    class CsvFile
-      include DataTarget
-
-      def self.default_csv_options
-        CSV::DEFAULT_OPTIONS.merge({
-          headers: true,
-          header_converters: Remi::FieldSymbolizers[:standard],
-          col_sep: ',',
-          encoding: 'UTF-8',
-          quote_char: '"',
-          row_sep: "\n"
-        })
-      end
-
-      def initialize(path:, csv_options: {}, logger: Remi::Settings.logger)
-        @path = path
-        @csv_options = self.class.default_csv_options.merge(csv_options)
-        @logger = logger
-      end
-
-      attr_reader :path
-      attr_reader :csv_options
-
-      def field_symbolizer
-        self.class.default_csv_options[:header_converters]
-      end
-
-      def load
-        return true if @loaded || df.size == 0
-
-        @logger.info "Writing CSV file #{@path}"
-
-        df.write_csv @path, @csv_options
-
-        @loaded = true
-      end
-
-    end
-  end
-end
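Construction merged caller options over default_csv_options, and load delegated to Daru's write_csv. A hedged usage sketch against this removed API (the path, option values, and the df= assignment are illustrative):

    require 'remi'

    target = Remi::DataTarget::CsvFile.new(
      path: 'out/contacts.csv',
      csv_options: { col_sep: '|' }  # merged over default_csv_options
    )
    target.df = Daru::DataFrame.new(id: [1, 2], name: ['a', 'b'])  # df= assumed from DataSubject
    target.load  # no-op when already loaded or when df is empty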
data/lib/remi/data_target/postgres.rb
DELETED
@@ -1,74 +0,0 @@
-module Remi
-  module DataTarget
-    class Postgres
-      include DataTarget
-
-      def initialize(credentials:, table_name:, fields:, logger: Remi::Settings.logger)
-        @credentials = credentials
-        @table_name = table_name
-        @fields = fields
-        @logger = logger
-      end
-
-      def load
-        return true if @loaded || df.size == 0
-
-        @logger.info "Performing postgres load to table #{@table_name}"
-        create_target_table
-        load_target_table
-
-        @loaded = true
-      end
-
-
-      def connection
-        @connection ||= PG.connect(
-          host: @credentials[:host] || 'localhost',
-          port: @credentials[:port] || 5432,
-          dbname: @credentials[:dbname],
-          user: @credentials[:user] || `whoami`.chomp,
-          password: @credentials[:password],
-          sslmode: @credentials[:sslmode] || 'allow'
-        )
-      end
-
-
-      def fields_with_type_ddl
-        @fields.map { |k,v| "#{k} #{v[:type]}" }.join(', ')
-      end
-
-      def create_target_table
-        connection.exec <<-EOT
-          CREATE TEMPORARY TABLE #{@table_name} (
-            #{fields_with_type_ddl}
-          )
-        EOT
-      end
-
-      def load_target_table
-        connection.copy_data "COPY #{@table_name} (#{@fields.keys.join(', ')}) FROM STDIN" do
-          df.each(:row) do |row|
-            row_str = @fields.keys.map do |field|
-              field = row[field]
-              case
-              when field.respond_to?(:strftime)
-                field.strftime('%Y-%m-%d %H:%M:%S')
-              when field.respond_to?(:map)
-                field.to_json.gsub("\t", '\t')
-              when field.blank? && !field.nil?
-                ''
-              when field.nil?
-                '\N'
-              else
-                field.to_s.gsub("\t", '\t')
-              end
-            end.join("\t")
-
-            connection.put_copy_data row_str + "\n"
-          end
-        end
-      end
-
-    end
-  end
-end
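load_target_table streamed rows in Postgres COPY text format: tab-delimited values, \N as the NULL marker, literal tabs escaped, and dates/collections serialized first. The same encoding rules in isolation (a sketch; copy_encode is an illustrative helper, and the original's ActiveSupport-dependent blank? branch is folded into the final to_s):

    require 'json'

    # Encode one Ruby value the way load_target_table did for COPY ... FROM STDIN.
    def copy_encode(value)
      if value.respond_to?(:strftime)
        value.strftime('%Y-%m-%d %H:%M:%S')  # dates and times
      elsif value.respond_to?(:map)
        value.to_json.gsub("\t", '\t')       # arrays/hashes as JSON
      elsif value.nil?
        '\N'                                 # COPY's NULL marker
      else
        value.to_s.gsub("\t", '\t')          # everything else as escaped text
      end
    end

    row = [Time.new(2016, 1, 1), nil, "tab\there"]
    row.map { |v| copy_encode(v) }.join("\t")
    # => "2016-01-01 00:00:00\t\\N\ttab\\there"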
data/lib/remi/data_target/salesforce.rb
DELETED
@@ -1,54 +0,0 @@
-module Remi
-  module DataTarget
-    class Salesforce
-      include DataTarget
-
-      def initialize(object:, operation:, credentials:, external_id: 'Id', api: :bulk, logger: Remi::Settings.logger)
-        @sfo = object
-        @operation = operation
-        @external_id = external_id
-        @credentials = credentials
-        @api = api
-        @logger = logger
-      end
-
-      def field_symbolizer
-        Remi::FieldSymbolizers[:salesforce]
-      end
-
-      def load
-        return true if @loaded || df.size == 0
-
-        @logger.info "Performing Salesforce #{@operation} on object #{@sfo}"
-
-        if @operation == :update
-          Remi::SfBulkHelper::SfBulkUpdate.update(restforce_client, @sfo, df_as_array_of_hashes, logger: @logger)
-        elsif @operation == :create
-          Remi::SfBulkHelper::SfBulkCreate.create(restforce_client, @sfo, df_as_array_of_hashes, logger: @logger)
-        elsif @operation == :upsert
-          Remi::SfBulkHelper::SfBulkUpsert.upsert(restforce_client, @sfo, df_as_array_of_hashes, external_id: @external_id, logger: @logger)
-        else
-          raise "Unknown operation: #{@operation}"
-        end
-
-        @loaded = true
-      end
-
-      def restforce_client
-        @restforce_client ||= begin
-          client = Restforce.new(@credentials)
-
-          # Run a dummy query to initiate a connection. Workaround for Bulk API problem
-          # https://github.com/yatish27/salesforce_bulk_api/issues/33
-          client.query('SELECT Id FROM Contact LIMIT 1')
-          client
-        end
-      end
-
-      def df_as_array_of_hashes
-        df.to_a[0]
-      end
-
-    end
-  end
-end
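load dispatched on @operation to the SfBulkHelper classes, and restforce_client issued a throwaway query to work around salesforce_bulk_api issue #33. A hedged construction sketch for this removed class (credential keys follow Restforce's username-password flow; all values are placeholders):

    require 'remi'

    target = Remi::DataTarget::Salesforce.new(
      object: 'Contact',
      operation: :upsert,                # :update, :create, or :upsert
      external_id: 'External_Id__c',
      credentials: {
        username:       'user@example.com',
        password:       'secret',
        security_token: 'token',
        client_id:      'client-id',
        client_secret:  'client-secret',
        host:           'login.salesforce.com'
      }
    )
    target.load  # runs SfBulkUpsert.upsert against Contact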
data/lib/remi/data_target/sftp_file.rb
DELETED
@@ -1,54 +0,0 @@
-module Remi
-  module DataTarget
-    class SftpFile
-      include DataTarget
-
-      def initialize(credentials:, local_path:, remote_path: File.basename(local_path), logger: Remi::Settings.logger)
-        @credentials = credentials
-        @local_path = local_path
-        @remote_path = remote_path
-        @logger = logger
-      end
-
-      attr_reader :local_path
-      attr_reader :remote_path
-
-      def load
-        return true if @loaded
-
-        connection do |sftp|
-          retry_upload { sftp.upload! @local_path, @remote_path }
-        end
-
-        @loaded = true
-      end
-
-
-
-      private
-
-      def connection(&block)
-        result = nil
-        Net::SFTP.start(@credentials[:host], @credentials[:username], password: @credentials[:password], port: @credentials[:port] || '22') do |sftp|
-          result = yield sftp
-        end
-        result
-      end
-
-      def retry_upload(ntry=2, &block)
-        1.upto(ntry).each do |itry|
-          begin
-            block.call
-          rescue RuntimeError => err
-            raise err unless itry < ntry
-            @logger.error "Upload failed with error: #{err.message}"
-            @logger.error "Retry attempt #{itry}/#{ntry-1}"
-            sleep(1)
-          end
-        end
-      end
-
-
-    end
-  end
-end
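retry_upload made up to ntry attempts, rescuing only RuntimeError and sleeping a second between tries, while connection opened a fresh SFTP session around each load. A hedged usage sketch (host, credentials, and paths are placeholders):

    require 'remi'

    target = Remi::DataTarget::SftpFile.new(
      credentials: { host: 'sftp.example.com', username: 'remi', password: 'secret' },
      local_path: 'out/contacts.csv'  # remote_path defaults to the basename
    )
    target.load  # uploads contacts.csv, retrying once on RuntimeError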
data/lib/remi/refinements/daru.rb
DELETED
@@ -1,85 +0,0 @@
-module Remi
-  module Refinements
-    module Daru
-      refine ::Daru::DataFrame do
-
-        # Public: Fixes dup issues in the Daru library (vectors not being duped).
-        def dup
-          dupdf = ::Daru::DataFrame.new([], index: self.index)
-          self.vectors.each do |v|
-            dupdf[v] = self[v]
-          end
-
-          dupdf
-        end
-
-        # Public: Allows for combining dataframes with different columns.
-        def concat(other_df)
-          vectors = @vectors.to_a
-          data = []
-
-          vectors.each do |v|
-            other_vec = other_df.vectors.include?(v) ? other_df[v].to_a : [nil] * other_df.size
-            data << self[v].dup.to_a.concat(other_vec)
-          end
-
-          other_df.vectors.each do |v|
-            next if vectors.include?(v)
-            vectors << v
-            data << ([nil] * self.size).concat(other_df[v].to_a)
-          end
-
-          ::Daru::DataFrame.new(data, order: vectors)
-        end
-
-        # Public: Saves a DataFrame to a file.
-        def hash_dump(filename)
-          File.binwrite(filename, Marshal.dump(self.to_hash))
-        end
-
-        # Public: Allows the user to define an arbitrary aggregation function.
-        #
-        # by   - The name of the DataFrame vector to use to group records.
-        # func - A lambda function that accepts three arguments - the
-        #        first argument is the DataFrame, the second is the
-        #        key to the current group, and the third is the index
-        #        of the elements belonging to a group.
-        #
-        # Example:
-        #   df = Daru::DataFrame.new({ a: ['a','a','a','b','b'], year: ['2018','2015','2019','2014','2013'] })
-        #
-        #   mymin = lambda do |field, df, group_key, indices|
-        #     values = indices.map { |idx| df.row[idx][field] }
-        #     "Group #{group_key} has a minimum value of #{values.min}"
-        #   end
-        #
-        #   df.aggregate(by: :a, func: mymin.curry.(:year))
-        #
-        #
-        # Returns a Daru::Vector.
-        def aggregate(by:, func:)
-          grouped = self.group_by(by)
-          df_indices = self.index.to_a
-          ::Daru::Vector.new(
-            grouped.groups.reduce({}) do |h, (key, indices)|
-              # Daru groups don't use the index of the dataframe when returning groups (WTF?).
-              # Instead they return the position of the record, so map positions back to index values here.
-              group_df_indices = indices.map { |v| df_indices[v] }
-              group_key = key.size == 1 ? key.first : key
-              h[group_key] = func.(self, group_key, group_df_indices)
-              h
-            end
-          )
-        end
-
-      end
-
-      refine ::Daru::DataFrame.singleton_class do
-        # Public: Creates a DataFrame by reading the dumped version from a file.
-        def from_hash_dump(filename)
-          ::Daru::DataFrame.new(Marshal.load(File.binread(filename)))
-        end
-      end
-    end
-  end
-end
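Because these were refinements, they took effect only in files that opted in with `using`. A minimal usage sketch (assumes the Daru 0.1-era API this release targeted; data values are illustrative):

    require 'daru'
    require 'remi'

    using Remi::Refinements::Daru

    df    = Daru::DataFrame.new(a: ['x', 'x', 'y'], n: [1, 2, 3])
    other = Daru::DataFrame.new(a: ['z'], m: [9])

    # concat: union of columns, nil-padded where a column is missing
    combined = df.concat(other)  # vectors :a, :n, :m

    # aggregate: arbitrary lambda per group, keyed by values of :a
    counts = df.aggregate(by: :a, func: ->(d, key, idx) { idx.size })
    # => Daru::Vector: 'x' => 2, 'y' => 1

    # hash_dump / from_hash_dump: Marshal round-trip through a file
    df.hash_dump('df.bin')
    restored = Daru::DataFrame.from_hash_dump('df.bin')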