remi 0.0.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.bundle/config +2 -0
- data/.gitignore +3 -2
- data/.rspec +2 -0
- data/.ruby-version +1 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +123 -0
- data/LICENSE.txt +21 -0
- data/README.md +94 -3
- data/bin/remi +8 -0
- data/doc/install-rbenv-os_x.md +47 -0
- data/lib/remi.rb +56 -9
- data/lib/remi/cli.rb +56 -0
- data/lib/remi/core/daru.rb +28 -0
- data/lib/remi/core/refinements.rb +21 -0
- data/lib/remi/core/string.rb +8 -0
- data/lib/remi/cucumber.rb +7 -0
- data/lib/remi/cucumber/business_rules.rb +504 -0
- data/lib/remi/cucumber/data_source.rb +63 -0
- data/lib/remi/data_source.rb +13 -0
- data/lib/remi/data_source/csv_file.rb +79 -0
- data/lib/remi/data_source/data_frame.rb +10 -0
- data/lib/remi/data_source/postgres.rb +58 -0
- data/lib/remi/data_source/salesforce.rb +78 -0
- data/lib/remi/data_subject.rb +25 -0
- data/lib/remi/data_target.rb +15 -0
- data/lib/remi/data_target/csv_file.rb +49 -0
- data/lib/remi/data_target/data_frame.rb +14 -0
- data/lib/remi/data_target/salesforce.rb +49 -0
- data/lib/remi/extractor/sftp_file.rb +84 -0
- data/lib/remi/field_symbolizers.rb +17 -0
- data/lib/remi/job.rb +200 -0
- data/lib/remi/lookup/regex_sieve.rb +55 -0
- data/lib/remi/project/features/examples.feature +24 -0
- data/lib/remi/project/features/formulas.feature +64 -0
- data/lib/remi/project/features/sample_job.feature +304 -0
- data/lib/remi/project/features/step_definitions/remi_step.rb +310 -0
- data/lib/remi/project/features/support/env.rb +10 -0
- data/lib/remi/project/features/support/env_app.rb +3 -0
- data/lib/remi/project/features/transforms/date_diff.feature +50 -0
- data/lib/remi/project/features/transforms/parse_date.feature +34 -0
- data/lib/remi/project/features/transforms/prefix.feature +15 -0
- data/lib/remi/project/jobs/all_jobs_shared.rb +25 -0
- data/lib/remi/project/jobs/copy_source_job.rb +12 -0
- data/lib/remi/project/jobs/sample_job.rb +164 -0
- data/lib/remi/project/jobs/transforms/date_diff_job.rb +17 -0
- data/lib/remi/project/jobs/transforms/parse_date_job.rb +18 -0
- data/lib/remi/project/jobs/transforms/prefix_job.rb +16 -0
- data/lib/remi/project/jobs/transforms/transform_jobs.rb +3 -0
- data/lib/remi/settings.rb +39 -0
- data/lib/remi/sf_bulk_helper.rb +265 -0
- data/lib/remi/source_to_target_map.rb +93 -0
- data/lib/remi/transform.rb +137 -0
- data/lib/remi/version.rb +3 -0
- data/remi.gemspec +25 -7
- data/workbooks/sample_workbook.ipynb +56 -0
- data/workbooks/workbook_helper.rb +1 -0
- metadata +234 -17
- data/lib/noodling.rb +0 -163
- data/test/test_NAME.rb +0 -19
@@ -0,0 +1,63 @@
|
|
1
|
+
module Remi
  module DataSource
    # Mixin that fabricates a single row of plausible stub data from a
    # source's field metadata (@fields), so tests can run without a real
    # data source.
    module DataStub
      # Builds one array of stub values, one entry per field in @fields.
      # A field's :type metadata selects the value: :date fields are
      # rendered with the field's :format string, fields with no :type
      # fall back to the string stub, and any other type is looked up in
      # stub_values.
      def stub_row_array
        @fields.values.map do |meta|
          type = meta[:type]
          if type == :date
            stub_values[:date].strftime(meta[:format])
          elsif type.nil?
            stub_values[:string]
          else
            stub_values[type] # NOTE(review): an unlisted type yields nil -- confirm intended
          end
        end
      end

      # Replaces the subject's dataframe with a one-row Daru::DataFrame
      # containing the stub row.
      def stub_df
        frame = Daru::DataFrame.new([], order: @fields.keys)
        frame.add_row(stub_row_array)
        self.df = frame
      end

      # Canonical stub value for each supported field type (memoized).
      def stub_values
        @stub_values ||= {
          string: "Some String",
          number: 133,
          float: 3.14159,
          integer: 38,
          date: Date.parse('2015-10-21')
        }
      end
    end


    # Stubbing support for CSV file sources: writes the stub row out to a
    # temporary CSV file that can stand in for a real extract.
    class CsvFile
      include DataStub

      # Path of the temporary stub file (memoized).
      def stub_tmp_file
        @stub_tmp_file ||= Tempfile.new('stub_tmp_file.csv').path
      end

      # Writes a header line plus one stub data row, then returns the path.
      def write_stub_tmp_file
        File.open(stub_tmp_file, "wb") do |csv_file|
          csv_file.puts stub_header
          csv_file.puts stub_row_csv
        end

        stub_tmp_file
      end

      # Header line: field names joined with the configured column separator.
      def stub_header
        @fields.keys.join(@csv_options[:col_sep])
      end

      # Data line: stub values joined with the configured column separator.
      def stub_row_csv
        stub_row_array.join(@csv_options[:col_sep])
      end
    end

    # Salesforce sources need no machinery beyond the shared stubs.
    class Salesforce
      include DataStub
    end
  end
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
module Remi
  module DataSource
    # Data source that reads a single CSV file (from a local path or an
    # SFTP extractor) into a Daru::DataFrame using configurable CSV
    # parsing options.
    class CsvFile
      include DataSource

      # Default CSV parsing options, layered over the stdlib defaults.
      # Headers are parsed and symbolized with the standard field symbolizer.
      def self.default_csv_options
        CSV::DEFAULT_OPTIONS.merge({
          headers: true,
          header_converters: Remi::FieldSymbolizers[:standard],
          col_sep: ',',
          encoding: 'UTF-8',
          quote_char: '"'
        })
      end


      # @param fields [Hash] field name (Symbol) => metadata Hash
      # @param extractor [Extractor::SftpFile, Extractor::LocalFile, String]
      #   extractor instance, or a local path wrapped by #extractor=
      # @param csv_options [Hash] overrides merged over default_csv_options
      # @param logger [Logger] destination for progress messages
      def initialize(fields: {}, extractor:, csv_options: {}, logger: Remi::Settings.logger)
        @fields = fields
        self.extractor = extractor
        @csv_options = self.class.default_csv_options.merge(csv_options)
        @logger = logger
      end

      attr_accessor :fields
      attr_reader :extractor
      attr_reader :csv_options

      # NOTE: returns the class-level default header converter, ignoring
      # any :header_converters override passed to this instance.
      def field_symbolizer
        self.class.default_csv_options[:header_converters]
      end

      # Extracted file path(s) as an array; currently limited to one file.
      def extract
        Array(@extractor.extract).tap { |x| raise "Multiple files not supported" if x.size > 1 }
      end

      # Accepts an extractor instance directly, or a String path which is
      # wrapped in a local-file extractor.
      def extractor=(arg)
        case arg
        when Extractor::SftpFile, Extractor::LocalFile
          @extractor = arg
        when String
          @extractor = Extractor::LocalFile.new(arg)
        when Regexp
          raise "Adding regex matching to local files would be easy, not done yet"
        else
          raise "Unknown extractor of type #{arg.class}: #{arg}"
        end
      end

      # Only going to support single file for now
      def source_filename
        @source_filename ||= extract.first
      end

      # First line of the source file (memoized).
      def first_line
        # Readline assumes \n line endings. Strip out \r if it is a DOS file.
        @first_line ||= File.open(source_filename) do |f|
          f.readline.gsub(/\r/,'')
        end
      end

      # Symbolized headers taken from the first record of the file.
      # BUGFIX: previously called `source_csv_options`, a method defined
      # nowhere in this class (NameError at runtime); the instance's
      # @csv_options are now used, splatted as keywords so the call also
      # works on csv >= 3.0.
      # NOTE(review): assumes the file has at least one data row after the
      # header -- `csv.first` is nil for a header-only file.
      def headers
        @headers ||= CSV.open(source_filename, 'r', **@csv_options) { |csv| csv.first }.headers
      end

      # True when every declared field is present in the file's headers.
      def valid_headers?
        (fields.keys - headers).empty?
      end

      # Reads the whole file into a Daru::DataFrame.
      def to_dataframe
        @logger.info "Converting #{source_filename} to a dataframe"
        Daru::DataFrame.from_csv source_filename, @csv_options
      end

      # Memoized dataframe accessor.
      def df
        @dataframe ||= to_dataframe
      end
    end
  end
end
|
@@ -0,0 +1,58 @@
|
|
1
|
+
module Remi
  module DataSource
    # Data source backed by a PostgreSQL query: executes the query over a
    # PG connection and converts the result set into a Daru::DataFrame.
    class Postgres
      include DataSource

      # @param fields [Hash] field name => metadata Hash
      # @param credentials [Hash] connection settings (:host, :port,
      #   :dbname, :user, :password, :sslmode)
      # @param query [String] SQL to execute
      # @param logger [Logger] destination for progress messages
      def initialize(fields: {}, credentials:, query:, logger: Remi::Settings.logger)
        @fields = fields
        @credentials = credentials
        @query = query
        @logger = logger
      end

      attr_accessor :fields

      # Executes the query and caches the raw PG result.
      def extract
        @logger.info "Executing query #{@query}"
        @raw_result = pg_conn.exec @query
      end

      # Raw query result, extracting on first access.
      def raw_result
        @raw_result ||= extract
      end

      # Lazily-opened PG connection built from @credentials, defaulting to
      # localhost:5432, the current OS user, and SSL required.
      def pg_conn
        @pg_conn ||= PG.connect(
          host: @credentials[:host] || 'localhost',
          port: @credentials[:port] || 5432,
          dbname: @credentials[:dbname],
          user: @credentials[:user] || `whoami`.chomp,
          password: @credentials[:password],
          sslmode: @credentials[:sslmode] || 'require'
        )
      end


      # Builds a column-oriented hash from the result rows and wraps it in
      # a Daru::DataFrame.
      # Performance for larger sets could be improved by using bulk query (via COPY)
      def to_dataframe
        @logger.info "Converting query to a dataframe"

        columns = {}
        raw_result.each do |row|
          row.each do |name, value|
            key = field_symbolizer.call(name)
            (columns[key] ||= []) << value
          end
        end

        # After converting to DF, clear the PG results to save memory.
        raw_result.clear

        Daru::DataFrame.new columns, order: columns.keys
      end

      # Memoized dataframe accessor.
      def df
        @dataframe ||= to_dataframe
      end
    end
  end
end
|
@@ -0,0 +1,78 @@
|
|
1
|
+
module Remi
  module DataSource
    # Data source that pulls records from Salesforce via the Bulk API and
    # converts the batched query results into a Daru::DataFrame.
    class Salesforce
      include DataSource

      # @param fields [Hash] field name => metadata Hash
      # @param object [String, Symbol] Salesforce object to query
      # @param query [String] SOQL query
      # @param credentials [Hash] Restforce credentials
      # @param api [Symbol] API flavor (defaults to :bulk)
      # @param logger [Logger] destination for progress messages
      def initialize(fields: {}, object:, query:, credentials:, api: :bulk, logger: Remi::Settings.logger)
        @fields = fields
        @sfo = object
        @query = query
        @credentials = credentials
        @api = api
        @logger = logger
      end

      attr_accessor :fields
      attr_accessor :raw_result

      # Salesforce sources use the Salesforce-specific field symbolizer.
      def field_symbolizer
        Remi::FieldSymbolizers[:salesforce]
      end

      # Runs the bulk query (batch size 10000) and caches the raw result.
      def extract
        @raw_result = sf_bulk.query(@sfo, @query, 10000)
      end

      # Raw bulk-query result, extracting on first access.
      def raw_result
        @raw_result ||= extract
      end



      # Restforce client (memoized).
      def restforce_client
        @restforce_client ||= begin
          client = Restforce.new(@credentials)

          #run a dummy query to initiate a connection. Workaround for Bulk API problem
          # https://github.com/yatish27/salesforce_bulk_api/issues/33
          client.query('SELECT Id FROM Contact LIMIT 1')
          client
        end
      end

      # Bulk API wrapper around the restforce client (memoized), with job
      # status polling throttled to 5 seconds.
      def sf_bulk
        @sf_bulk ||= SalesforceBulkApi::Api.new(restforce_client).tap { |o| o.connection.set_status_throttle(5) }
      end

      # Flattens the batched bulk-query response into a column-oriented
      # hash and wraps it in a Daru::DataFrame. Each record value arrives
      # as a one-element array; a Hash value of {"xsi:nil" => "true"}
      # marks a null.
      def to_dataframe
        @logger.info "Converting salesforce query results to a dataframe"

        columns = {}
        raw_result['batches'].each do |batch|
          next unless batch['response']

          batch['response'].each do |record|
            record.each do |field, value|
              next if field == 'xsi:type' || field == 'type'

              cell = value.first
              cell = nil if cell.is_a?(Hash) && cell["xsi:nil"] == "true"
              (columns[field.to_sym] ||= []) << cell
            end
          end

          # delete raw result at end of processing to free memory
          batch['response'] = nil
        end

        Daru::DataFrame.new columns, order: columns.keys
      end

      # Memoized dataframe accessor.
      def df
        @dataframe ||= to_dataframe
      end
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module Remi
  # Shared behavior for all data subjects (sources and targets): a lazily
  # created dataframe plus field metadata derived from its vectors.
  module DataSubject
    # Default field symbolizer; specific subjects may override.
    def field_symbolizer
      Remi::FieldSymbolizers[:standard]
    end

    # The subject's dataframe, defaulting to an empty Daru::DataFrame.
    def df
      @dataframe ||= Daru::DataFrame.new([])
    end

    # Replaces the subject's dataframe.
    def df=(new_dataframe)
      @dataframe = new_dataframe
    end

    # Fields is a hash where the keys are the data field names and the values
    # are a hash of metadata. DataFrames do not currently support metadata,
    # so the metadata will be empty unless overridden by the specific target.
    def fields
      df.vectors.to_a.each_with_object({}) { |vector, meta| meta[vector] = {} }
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Remi
  # Shared behavior for all data targets. Concrete targets are expected to
  # override #load with their own loading logic.
  module DataTarget
    include DataSubject

    # Gets called automatically at the end of a job, but could
    # also get manually called at the end of a transform so make
    # sure it doesn't do it twice.
    def load
      @logger.info "Loading target"
      if @loaded
        true
      else
        # NOTE(review): @loaded is set even though this base implementation
        # always raises -- presumably so the failure fires only once; confirm.
        @loaded = true
        raise "Load function undefined for #{self.class.name}"
      end
    end
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
module Remi
  module DataTarget
    # Data target that loads the dataframe into Salesforce via the Bulk
    # API, updating or creating records on the given object.
    class Salesforce
      include DataTarget

      # @param object [String, Symbol] Salesforce object (e.g. 'Contact')
      # @param operation [Symbol] :update or :create
      # @param credentials [Hash] Restforce credentials
      # @param api [Symbol] API flavor (defaults to :bulk)
      # @param logger [Logger] destination for progress messages
      def initialize(object:, operation:, credentials:, api: :bulk, logger: Remi::Settings.logger)
        @sfo = object
        @operation = operation
        @credentials = credentials
        @api = api
        @logger = logger
      end

      # Salesforce targets use the Salesforce-specific field symbolizer.
      def field_symbolizer
        Remi::FieldSymbolizers[:salesforce]
      end

      # Loads the dataframe into Salesforce. Idempotent: returns early if
      # already loaded or if there is nothing to load.
      # @raise [ArgumentError] for an unknown @operation.
      #   BUGFIX: an unknown operation was previously ignored silently
      #   while the target was still marked as loaded.
      def load
        return true if @loaded || df.size == 0

        @logger.info "Performing Salesforce #{@operation} on object #{@sfo}"

        if @operation == :update
          Remi::SfBulkHelper::SfBulkUpdate.update(restforce_client, @sfo, df_as_array_of_hashes, logger: @logger)
        elsif @operation == :create
          Remi::SfBulkHelper::SfBulkCreate.create(restforce_client, @sfo, df_as_array_of_hashes, logger: @logger)
        else
          raise ArgumentError, "Unknown Salesforce operation: #{@operation}"
        end

        @loaded = true
      end

      # Restforce client (memoized).
      def restforce_client
        @restforce_client ||= begin
          client = Restforce.new(@credentials)

          #run a dummy query to initiate a connection. Workaround for Bulk API problem
          # https://github.com/yatish27/salesforce_bulk_api/issues/33
          client.query('SELECT Id FROM Contact LIMIT 1')
          client
        end
      end

      # Converts the dataframe into the array-of-record-hashes shape the
      # bulk helpers expect.
      def df_as_array_of_hashes
        df.to_a[0]
      end

    end
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
module Remi
  module DataTarget
    # Data target that loads the dataframe into Salesforce via the Bulk
    # API, updating or creating records on the given object.
    class Salesforce
      include DataTarget

      # @param object [String, Symbol] Salesforce object (e.g. 'Contact')
      # @param operation [Symbol] :update or :create
      # @param credentials [Hash] Restforce credentials
      # @param api [Symbol] API flavor (defaults to :bulk)
      # @param logger [Logger] destination for progress messages
      def initialize(object:, operation:, credentials:, api: :bulk, logger: Remi::Settings.logger)
        @sfo = object
        @operation = operation
        @credentials = credentials
        @api = api
        @logger = logger
      end

      # Salesforce targets use the Salesforce-specific field symbolizer.
      def field_symbolizer
        Remi::FieldSymbolizers[:salesforce]
      end

      # Loads the dataframe into Salesforce. Idempotent: returns early if
      # already loaded or if there is nothing to load.
      # @raise [ArgumentError] for an unknown @operation.
      #   BUGFIX: an unknown operation was previously ignored silently
      #   while the target was still marked as loaded.
      def load
        return true if @loaded || df.size == 0

        @logger.info "Performing Salesforce #{@operation} on object #{@sfo}"

        if @operation == :update
          Remi::SfBulkHelper::SfBulkUpdate.update(restforce_client, @sfo, df_as_array_of_hashes, logger: @logger)
        elsif @operation == :create
          Remi::SfBulkHelper::SfBulkCreate.create(restforce_client, @sfo, df_as_array_of_hashes, logger: @logger)
        else
          raise ArgumentError, "Unknown Salesforce operation: #{@operation}"
        end

        @loaded = true
      end

      # Restforce client (memoized).
      def restforce_client
        @restforce_client ||= begin
          client = Restforce.new(@credentials)

          #run a dummy query to initiate a connection. Workaround for Bulk API problem
          # https://github.com/yatish27/salesforce_bulk_api/issues/33
          client.query('SELECT Id FROM Contact LIMIT 1')
          client
        end
      end

      # Converts the dataframe into the array-of-record-hashes shape the
      # bulk helpers expect.
      def df_as_array_of_hashes
        df.to_a[0]
      end

    end
  end
end
|