remi 0.2.27 → 0.2.28
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +3 -0
- data/Gemfile.lock +34 -5
- data/features/metadata.feature +17 -0
- data/features/step_definitions/remi_step.rb +6 -6
- data/features/transforms/date_diff.feature +1 -0
- data/jobs/aggregate_job.rb +0 -1
- data/jobs/all_jobs_shared.rb +0 -2
- data/jobs/copy_source_job.rb +0 -1
- data/jobs/csv_file_target_job.rb +0 -1
- data/jobs/metadata_job.rb +60 -0
- data/jobs/parameters_job.rb +1 -1
- data/jobs/sample_job.rb +19 -20
- data/jobs/sftp_file_target_job.rb +0 -1
- data/jobs/transforms/date_diff_job.rb +1 -1
- data/jobs/transforms/nvl_job.rb +1 -1
- data/jobs/transforms/parse_date_job.rb +7 -4
- data/jobs/transforms/prefix_job.rb +1 -1
- data/jobs/transforms/truncate_job.rb +1 -1
- data/lib/remi.rb +10 -15
- data/lib/remi/cucumber/business_rules.rb +23 -23
- data/lib/remi/cucumber/data_source.rb +2 -1
- data/lib/remi/data_frame.rb +36 -0
- data/lib/remi/data_frame/daru.rb +67 -0
- data/lib/remi/data_subject.rb +71 -10
- data/lib/remi/data_subject/csv_file.rb +151 -0
- data/lib/remi/data_subject/data_frame.rb +53 -0
- data/lib/remi/data_subject/postgres.rb +136 -0
- data/lib/remi/data_subject/salesforce.rb +136 -0
- data/lib/remi/data_subject/sftp_file.rb +66 -0
- data/lib/remi/fields.rb +8 -0
- data/lib/remi/source_to_target_map.rb +56 -32
- data/lib/remi/transform.rb +426 -83
- data/lib/remi/version.rb +1 -1
- data/remi.gemspec +2 -1
- data/spec/metadata_spec.rb +62 -0
- metadata +15 -28
- data/lib/remi/data_source.rb +0 -13
- data/lib/remi/data_source/csv_file.rb +0 -101
- data/lib/remi/data_source/data_frame.rb +0 -16
- data/lib/remi/data_source/postgres.rb +0 -58
- data/lib/remi/data_source/salesforce.rb +0 -87
- data/lib/remi/data_target.rb +0 -15
- data/lib/remi/data_target/csv_file.rb +0 -42
- data/lib/remi/data_target/data_frame.rb +0 -14
- data/lib/remi/data_target/postgres.rb +0 -74
- data/lib/remi/data_target/salesforce.rb +0 -54
- data/lib/remi/data_target/sftp_file.rb +0 -54
- data/lib/remi/refinements/daru.rb +0 -85
data/lib/remi/version.rb
CHANGED
data/remi.gemspec
CHANGED
@@ -13,7 +13,8 @@ Gem::Specification.new do |s|
|
|
13
13
|
s.description = "Data manipulation and ETL in Ruby"
|
14
14
|
|
15
15
|
s.rubyforge_project = "Remi"
|
16
|
-
|
16
|
+
# Making use of a fork for now
|
17
|
+
# s.add_runtime_dependency "daru", ["0.1.2"]
|
17
18
|
|
18
19
|
s.add_runtime_dependency 'bond', ['~> 0.5']
|
19
20
|
s.add_runtime_dependency 'docile', ['~> 1.1']
|
@@ -0,0 +1,62 @@
|
|
1
|
+
=begin
|
2
|
+
|
3
|
+
this is probably mostly about data subjects
|
4
|
+
|
5
|
+
|
6
|
+
calling fields on the data subject should return only the fields defined, even if there
|
7
|
+
are additional fields on the dataframe
|
8
|
+
|
9
|
+
dataframe metadata is merged into field metadata, always with a preference for field metadata
|
10
|
+
|
11
|
+
metadata propagates through 1:1 STTMs
|
12
|
+
|
13
|
+
metadata propagates through intermediate dataframes that are not data subjects
|
14
|
+
|
15
|
+
|
16
|
+
puts '---- ROUTE 1 - direct -----'
|
17
|
+
out_activity.df = Remi::DataFrame.create(:daru, [], order: out_activity.fields.keys)#, index: in_activity.df.index)
|
18
|
+
Remi::SourceToTargetMap.apply(in_activity.df, out_activity.df) do
|
19
|
+
# map source(:activity_id, :student_id) .target(:activity_id, :student_id)
|
20
|
+
map source(:activity_id) .target(:activity_id)
|
21
|
+
.transform(->(v) { "-#{v}-" })
|
22
|
+
# enforce types needs to be based on the "fields" for the target
|
23
|
+
# I might have to convert any Daru Dataframe to Remi dataframes in the STTM
|
24
|
+
|
25
|
+
# .transform(Remi::Transform[:enforce_types].(on_error: :ignore))
|
26
|
+
|
27
|
+
map source(:student_id) .target(:student_id)
|
28
|
+
map source(:student_dob) .target(:student_dob)
|
29
|
+
end
|
30
|
+
|
31
|
+
|
32
|
+
puts "out_activity.fields: #{out_activity.fields}"
|
33
|
+
puts "out_activity.df metadata: #{out_activity.df.vector_metadata}"
|
34
|
+
out_activity.df = out_activity.df[*(out_activity.fields.keys)]
|
35
|
+
puts "out_activity.fields: #{out_activity.fields}"
|
36
|
+
puts "out_activity.df metadata: #{out_activity.df.vector_metadata}"
|
37
|
+
IRuby.display out_activity.df, type: 'text/html'
|
38
|
+
|
39
|
+
|
40
|
+
puts '---- ROUTE 2 - via work_df -----'
|
41
|
+
work_df = Remi::DataFrame.create(:daru, [], order: out_activity.fields.keys)#, index: in_activity.df.index)
|
42
|
+
Remi::SourceToTargetMap.apply(in_activity.df, work_df) do
|
43
|
+
map source(:activity_id) .target(:activity_id)
|
44
|
+
# .transform(Remi::Transform[:enforce_types].(on_error: :ignore))
|
45
|
+
|
46
|
+
map source(:student_id) .target(:student_id)
|
47
|
+
map source(:student_dob) .target(:student_dob)
|
48
|
+
end
|
49
|
+
|
50
|
+
IRuby.display work_df, type: 'text/html'
|
51
|
+
puts "work_df metadata: #{work_df.vector_metadata}"
|
52
|
+
|
53
|
+
puts "out_activity.fields metadata: #{out_activity.fields}"
|
54
|
+
puts "out_activity.df metadata: #{out_activity.df.vector_metadata}"
|
55
|
+
puts "work_df is a #{work_df.class}"
|
56
|
+
out_activity.df = work_df#[*out_activity.fields.keys]
|
57
|
+
puts "out_activity.fields metadata: #{out_activity.fields}"
|
58
|
+
puts "out_activity.df metadata: #{out_activity.df.vector_metadata}"
|
59
|
+
IRuby.display out_activity.df, type: 'text/html'
|
60
|
+
|
61
|
+
|
62
|
+
=end
|
metadata
CHANGED
@@ -1,29 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: remi
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.27
|
4
|
+
version: 0.2.28
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sterling Paramore
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-04-
|
11
|
+
date: 2016-04-28 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
-
- !ruby/object:Gem::Dependency
|
14
|
-
name: daru
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - '='
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: 0.1.2
|
20
|
-
type: :runtime
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
23
|
-
requirements:
|
24
|
-
- - '='
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
version: 0.1.2
|
27
13
|
- !ruby/object:Gem::Dependency
|
28
14
|
name: bond
|
29
15
|
requirement: !ruby/object:Gem::Requirement
|
@@ -200,6 +186,7 @@ files:
|
|
200
186
|
- features/csv_file_target_job.feature
|
201
187
|
- features/examples.feature
|
202
188
|
- features/formulas.feature
|
189
|
+
- features/metadata.feature
|
203
190
|
- features/parameters.feature
|
204
191
|
- features/sample_job.feature
|
205
192
|
- features/sftp_file_target_job.feature
|
@@ -215,6 +202,7 @@ files:
|
|
215
202
|
- jobs/all_jobs_shared.rb
|
216
203
|
- jobs/copy_source_job.rb
|
217
204
|
- jobs/csv_file_target_job.rb
|
205
|
+
- jobs/metadata_job.rb
|
218
206
|
- jobs/parameters_job.rb
|
219
207
|
- jobs/sample_job.rb
|
220
208
|
- jobs/sftp_file_target_job.rb
|
@@ -229,22 +217,18 @@ files:
|
|
229
217
|
- lib/remi/cucumber.rb
|
230
218
|
- lib/remi/cucumber/business_rules.rb
|
231
219
|
- lib/remi/cucumber/data_source.rb
|
232
|
-
- lib/remi/data_source.rb
|
233
|
-
- lib/remi/data_source/csv_file.rb
|
234
|
-
- lib/remi/data_source/data_frame.rb
|
235
|
-
- lib/remi/data_source/postgres.rb
|
236
|
-
- lib/remi/data_source/salesforce.rb
|
220
|
+
- lib/remi/data_frame.rb
|
221
|
+
- lib/remi/data_frame/daru.rb
|
237
222
|
- lib/remi/data_subject.rb
|
238
|
-
- lib/remi/data_target.rb
|
239
|
-
- lib/remi/data_target/csv_file.rb
|
240
|
-
- lib/remi/data_target/data_frame.rb
|
241
|
-
- lib/remi/data_target/postgres.rb
|
242
|
-
- lib/remi/data_target/salesforce.rb
|
243
|
-
- lib/remi/data_target/sftp_file.rb
|
223
|
+
- lib/remi/data_subject/csv_file.rb
|
224
|
+
- lib/remi/data_subject/data_frame.rb
|
225
|
+
- lib/remi/data_subject/postgres.rb
|
226
|
+
- lib/remi/data_subject/salesforce.rb
|
227
|
+
- lib/remi/data_subject/sftp_file.rb
|
244
228
|
- lib/remi/extractor/sftp_file.rb
|
245
229
|
- lib/remi/field_symbolizers.rb
|
230
|
+
- lib/remi/fields.rb
|
246
231
|
- lib/remi/job.rb
|
247
|
-
- lib/remi/refinements/daru.rb
|
248
232
|
- lib/remi/refinements/symbolizer.rb
|
249
233
|
- lib/remi/settings.rb
|
250
234
|
- lib/remi/sf_bulk_helper.rb
|
@@ -253,6 +237,7 @@ files:
|
|
253
237
|
- lib/remi/version.rb
|
254
238
|
- remi.gemspec
|
255
239
|
- spec/extractor/sftp_file_spec.rb
|
240
|
+
- spec/metadata_spec.rb
|
256
241
|
- spec/remi_spec.rb
|
257
242
|
- workbooks/sample_workbook.ipynb
|
258
243
|
- workbooks/workbook_helper.rb
|
@@ -285,6 +270,7 @@ test_files:
|
|
285
270
|
- features/csv_file_target_job.feature
|
286
271
|
- features/examples.feature
|
287
272
|
- features/formulas.feature
|
273
|
+
- features/metadata.feature
|
288
274
|
- features/parameters.feature
|
289
275
|
- features/sample_job.feature
|
290
276
|
- features/sftp_file_target_job.feature
|
@@ -297,4 +283,5 @@ test_files:
|
|
297
283
|
- features/transforms/prefix.feature
|
298
284
|
- features/transforms/truncate.feature
|
299
285
|
- spec/extractor/sftp_file_spec.rb
|
286
|
+
- spec/metadata_spec.rb
|
300
287
|
- spec/remi_spec.rb
|
data/lib/remi/data_source.rb
DELETED
@@ -1,101 +0,0 @@
|
|
1
|
-
module Remi
|
2
|
-
module DataSource
|
3
|
-
class CsvFile
|
4
|
-
include DataSource
|
5
|
-
|
6
|
-
using Remi::Refinements::Daru
|
7
|
-
|
8
|
-
def self.default_csv_options
|
9
|
-
CSV::DEFAULT_OPTIONS.merge({
|
10
|
-
headers: true,
|
11
|
-
header_converters: Remi::FieldSymbolizers[:standard],
|
12
|
-
converters: [],
|
13
|
-
col_sep: ',',
|
14
|
-
encoding: 'UTF-8',
|
15
|
-
quote_char: '"'
|
16
|
-
})
|
17
|
-
end
|
18
|
-
|
19
|
-
|
20
|
-
def initialize(fields: {}, extractor:, csv_options: {}, filename_field: nil, logger: Remi::Settings.logger)
|
21
|
-
@fields = fields
|
22
|
-
self.extractor = extractor
|
23
|
-
@csv_options = self.class.default_csv_options.merge(csv_options)
|
24
|
-
@filename_field = filename_field
|
25
|
-
@logger = logger
|
26
|
-
end
|
27
|
-
|
28
|
-
attr_accessor :fields
|
29
|
-
attr_reader :extractor
|
30
|
-
attr_reader :csv_options
|
31
|
-
|
32
|
-
def field_symbolizer
|
33
|
-
self.class.default_csv_options[:header_converters]
|
34
|
-
end
|
35
|
-
|
36
|
-
def extract
|
37
|
-
@extracted = Array(@extractor.extract)
|
38
|
-
end
|
39
|
-
|
40
|
-
def extracted
|
41
|
-
@extracted || extract
|
42
|
-
end
|
43
|
-
|
44
|
-
def extractor=(arg)
|
45
|
-
case arg
|
46
|
-
when Extractor::SftpFile, Extractor::LocalFile
|
47
|
-
@extractor = arg
|
48
|
-
when String
|
49
|
-
@extractor = Extractor::LocalFile.new(path: arg)
|
50
|
-
when Regexp
|
51
|
-
raise "Adding regex matching to local files would be easy, not done yet"
|
52
|
-
else
|
53
|
-
raise "Unknown extractor of type #{arg.class}: #{arg}"
|
54
|
-
end
|
55
|
-
end
|
56
|
-
|
57
|
-
# Only going to support single file for now
|
58
|
-
def source_filename
|
59
|
-
raise "Multiple source files detected" if extracted.size > 1
|
60
|
-
@source_filename ||= extracted.first
|
61
|
-
end
|
62
|
-
|
63
|
-
def first_line
|
64
|
-
# Readline assumes \n line endings. Strip out \r if it is a DOS file.
|
65
|
-
@first_line ||= File.open(source_filename) do |f|
|
66
|
-
f.readline.gsub(/\r/,'')
|
67
|
-
end
|
68
|
-
end
|
69
|
-
|
70
|
-
def headers
|
71
|
-
@headers ||= CSV.open(source_filename, 'r', source_csv_options) { |csv| csv.first }.headers
|
72
|
-
end
|
73
|
-
|
74
|
-
def valid_headers?
|
75
|
-
(fields.keys - headers).empty?
|
76
|
-
end
|
77
|
-
|
78
|
-
def to_dataframe
|
79
|
-
# Assumes that each file has exactly the same structure
|
80
|
-
result_df = nil
|
81
|
-
extracted.each_with_index do |filename, idx|
|
82
|
-
@logger.info "Converting #{filename} to a dataframe"
|
83
|
-
csv_df = Daru::DataFrame.from_csv filename, @csv_options
|
84
|
-
|
85
|
-
csv_df[@filename_field] = Daru::Vector.new([filename] * csv_df.size, index: csv_df.index) if @filename_field
|
86
|
-
if idx == 0
|
87
|
-
result_df = csv_df
|
88
|
-
else
|
89
|
-
result_df = result_df.concat csv_df
|
90
|
-
end
|
91
|
-
end
|
92
|
-
|
93
|
-
result_df
|
94
|
-
end
|
95
|
-
|
96
|
-
def df
|
97
|
-
@dataframe ||= to_dataframe
|
98
|
-
end
|
99
|
-
end
|
100
|
-
end
|
101
|
-
end
|
@@ -1,58 +0,0 @@
|
|
1
|
-
module Remi
|
2
|
-
module DataSource
|
3
|
-
class Postgres
|
4
|
-
include DataSource
|
5
|
-
|
6
|
-
def initialize(fields: {}, credentials:, query:, logger: Remi::Settings.logger)
|
7
|
-
@fields = fields
|
8
|
-
@credentials = credentials
|
9
|
-
@query = query
|
10
|
-
@logger = logger
|
11
|
-
end
|
12
|
-
|
13
|
-
attr_accessor :fields
|
14
|
-
|
15
|
-
def extract
|
16
|
-
@logger.info "Executing query #{@query}"
|
17
|
-
@raw_result = connection.exec @query
|
18
|
-
end
|
19
|
-
|
20
|
-
def raw_result
|
21
|
-
@raw_result ||= extract
|
22
|
-
end
|
23
|
-
|
24
|
-
def connection
|
25
|
-
@connection ||= PG.connect(
|
26
|
-
host: @credentials[:host] || 'localhost',
|
27
|
-
port: @credentials[:port] || 5432,
|
28
|
-
dbname: @credentials[:dbname],
|
29
|
-
user: @credentials[:user] || `whoami`.chomp,
|
30
|
-
password: @credentials[:password],
|
31
|
-
sslmode: @credentials[:sslmode] || 'allow'
|
32
|
-
)
|
33
|
-
end
|
34
|
-
|
35
|
-
|
36
|
-
def to_dataframe
|
37
|
-
# Performance for larger sets could be improved by using bulk query (via COPY)
|
38
|
-
@logger.info "Converting query to a dataframe"
|
39
|
-
|
40
|
-
hash_array = {}
|
41
|
-
raw_result.each do |row|
|
42
|
-
row.each do |field, value|
|
43
|
-
(hash_array[field_symbolizer.call(field)] ||= []) << value
|
44
|
-
end
|
45
|
-
end
|
46
|
-
|
47
|
-
# After converting to DF, clear the PG results to save memory.
|
48
|
-
raw_result.clear
|
49
|
-
|
50
|
-
Daru::DataFrame.new hash_array, order: hash_array.keys
|
51
|
-
end
|
52
|
-
|
53
|
-
def df
|
54
|
-
@dataframe ||= to_dataframe
|
55
|
-
end
|
56
|
-
end
|
57
|
-
end
|
58
|
-
end
|
@@ -1,87 +0,0 @@
|
|
1
|
-
module Remi
|
2
|
-
module DataSource
|
3
|
-
class Salesforce
|
4
|
-
include DataSource
|
5
|
-
|
6
|
-
def initialize(fields: {}, object:, query:, credentials:, api: :bulk, logger: Remi::Settings.logger)
|
7
|
-
@fields = fields
|
8
|
-
@sfo = object
|
9
|
-
@query = query
|
10
|
-
@credentials = credentials
|
11
|
-
@api = api
|
12
|
-
@logger = logger
|
13
|
-
end
|
14
|
-
|
15
|
-
attr_accessor :fields
|
16
|
-
attr_accessor :raw_result
|
17
|
-
|
18
|
-
def field_symbolizer
|
19
|
-
Remi::FieldSymbolizers[:salesforce]
|
20
|
-
end
|
21
|
-
|
22
|
-
def extract
|
23
|
-
@raw_result = sf_bulk.query(@sfo, @query, 10000)
|
24
|
-
|
25
|
-
check_for_errors(@raw_result)
|
26
|
-
@raw_result
|
27
|
-
end
|
28
|
-
|
29
|
-
def check_for_errors(sf_result)
|
30
|
-
sf_result['batches'].each do |batch|
|
31
|
-
raise "Error with batch #{batch['id']} - #{batch['state']}: #{batch['stateMessage']}" unless batch['state'].first == 'Completed'
|
32
|
-
end
|
33
|
-
end
|
34
|
-
|
35
|
-
def raw_result
|
36
|
-
@raw_result ||= extract
|
37
|
-
end
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
def restforce_client
|
42
|
-
@restforce_client ||= begin
|
43
|
-
client = Restforce.new(@credentials)
|
44
|
-
|
45
|
-
#run a dummy query to initiate a connection. Workaround for Bulk API problem
|
46
|
-
# https://github.com/yatish27/salesforce_bulk_api/issues/33
|
47
|
-
client.query('SELECT Id FROM Contact LIMIT 1')
|
48
|
-
client
|
49
|
-
end
|
50
|
-
end
|
51
|
-
|
52
|
-
def sf_bulk
|
53
|
-
@sf_bulk ||= SalesforceBulkApi::Api.new(restforce_client).tap { |o| o.connection.set_status_throttle(5) }
|
54
|
-
end
|
55
|
-
|
56
|
-
def to_dataframe
|
57
|
-
@logger.info "Converting salesforce query results to a dataframe"
|
58
|
-
|
59
|
-
hash_array = {}
|
60
|
-
raw_result['batches'].each do |batch|
|
61
|
-
next unless batch['response']
|
62
|
-
|
63
|
-
batch['response'].each do |record|
|
64
|
-
record.each do |field, value|
|
65
|
-
next if ['xsi:type','type'].include? field
|
66
|
-
(hash_array[field.to_sym] ||= []) << case value.first
|
67
|
-
when Hash
|
68
|
-
value.first["xsi:nil"] == "true" ? nil : value.first
|
69
|
-
else
|
70
|
-
value.first
|
71
|
-
end
|
72
|
-
end
|
73
|
-
end
|
74
|
-
|
75
|
-
# delete raw result at end of processing to free memory
|
76
|
-
batch['response'] = nil
|
77
|
-
end
|
78
|
-
|
79
|
-
Daru::DataFrame.new hash_array, order: hash_array.keys
|
80
|
-
end
|
81
|
-
|
82
|
-
def df
|
83
|
-
@dataframe ||= to_dataframe
|
84
|
-
end
|
85
|
-
end
|
86
|
-
end
|
87
|
-
end
|