remi 0.2.27 → 0.2.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +3 -0
- data/Gemfile.lock +34 -5
- data/features/metadata.feature +17 -0
- data/features/step_definitions/remi_step.rb +6 -6
- data/features/transforms/date_diff.feature +1 -0
- data/jobs/aggregate_job.rb +0 -1
- data/jobs/all_jobs_shared.rb +0 -2
- data/jobs/copy_source_job.rb +0 -1
- data/jobs/csv_file_target_job.rb +0 -1
- data/jobs/metadata_job.rb +60 -0
- data/jobs/parameters_job.rb +1 -1
- data/jobs/sample_job.rb +19 -20
- data/jobs/sftp_file_target_job.rb +0 -1
- data/jobs/transforms/date_diff_job.rb +1 -1
- data/jobs/transforms/nvl_job.rb +1 -1
- data/jobs/transforms/parse_date_job.rb +7 -4
- data/jobs/transforms/prefix_job.rb +1 -1
- data/jobs/transforms/truncate_job.rb +1 -1
- data/lib/remi.rb +10 -15
- data/lib/remi/cucumber/business_rules.rb +23 -23
- data/lib/remi/cucumber/data_source.rb +2 -1
- data/lib/remi/data_frame.rb +36 -0
- data/lib/remi/data_frame/daru.rb +67 -0
- data/lib/remi/data_subject.rb +71 -10
- data/lib/remi/data_subject/csv_file.rb +151 -0
- data/lib/remi/data_subject/data_frame.rb +53 -0
- data/lib/remi/data_subject/postgres.rb +136 -0
- data/lib/remi/data_subject/salesforce.rb +136 -0
- data/lib/remi/data_subject/sftp_file.rb +66 -0
- data/lib/remi/fields.rb +8 -0
- data/lib/remi/source_to_target_map.rb +56 -32
- data/lib/remi/transform.rb +426 -83
- data/lib/remi/version.rb +1 -1
- data/remi.gemspec +2 -1
- data/spec/metadata_spec.rb +62 -0
- metadata +15 -28
- data/lib/remi/data_source.rb +0 -13
- data/lib/remi/data_source/csv_file.rb +0 -101
- data/lib/remi/data_source/data_frame.rb +0 -16
- data/lib/remi/data_source/postgres.rb +0 -58
- data/lib/remi/data_source/salesforce.rb +0 -87
- data/lib/remi/data_target.rb +0 -15
- data/lib/remi/data_target/csv_file.rb +0 -42
- data/lib/remi/data_target/data_frame.rb +0 -14
- data/lib/remi/data_target/postgres.rb +0 -74
- data/lib/remi/data_target/salesforce.rb +0 -54
- data/lib/remi/data_target/sftp_file.rb +0 -54
- data/lib/remi/refinements/daru.rb +0 -85
data/lib/remi/version.rb
CHANGED
data/remi.gemspec
CHANGED
@@ -13,7 +13,8 @@ Gem::Specification.new do |s|
   s.description = "Data manipulation and ETL in Ruby"
 
   s.rubyforge_project = "Remi"
-  s.add_runtime_dependency "daru", ["0.1.2"]
+  # Making use of a fork for now
+  # s.add_runtime_dependency "daru", ["0.1.2"]
 
   s.add_runtime_dependency 'bond', ['~> 0.5']
   s.add_runtime_dependency 'docile', ['~> 1.1']
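The gemspec drops its hard pin on daru 0.1.2 and leaves it commented out; together with the three lines added to data/Gemfile in this release, the dependency is presumably being taken from a fork via the Gemfile instead. A hypothetical sketch of such a Gemfile entry (the fork URL and branch are placeholders, not taken from this diff):

    # Gemfile -- hypothetical illustration; the real fork location is not shown in this diff
    # Making use of a fork for now
    gem 'daru', git: 'https://github.com/example/daru.git', branch: 'master'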
data/spec/metadata_spec.rb
ADDED
@@ -0,0 +1,62 @@
+=begin
+
+this is probably mostly about data subjects
+
+
+calling fields on the data subject should return only the fields defined, even if there
+are additional fields on the dataframe
+
+dataframe metadata is merged into field metadata, always with a preference for field metadata
+
+metadata propagates through 1:1 STTMs
+
+metadata propagates through intermediate dataframes that are not data subjects
+
+
+puts '---- ROUTE 1 - direct -----'
+out_activity.df = Remi::DataFrame.create(:daru, [], order: out_activity.fields.keys)#, index: in_activity.df.index)
+Remi::SourceToTargetMap.apply(in_activity.df, out_activity.df) do
+  # map source(:activity_id, :student_id) .target(:activity_id, :student_id)
+  map source(:activity_id) .target(:activity_id)
+    .transform(->(v) { "-#{v}-" })
+  # enforce types needs to be based on the "fields" for the target
+  # I might have to convert any Daru Dataframe to Remi dataframes in the STTM
+
+  # .transform(Remi::Transform[:enforce_types].(on_error: :ignore))
+
+  map source(:student_id) .target(:student_id)
+  map source(:student_dob) .target(:student_dob)
+end
+
+
+puts "out_activity.fields: #{out_activity.fields}"
+puts "out_activity.df metadata: #{out_activity.df.vector_metadata}"
+out_activity.df = out_activity.df[*(out_activity.fields.keys)]
+puts "out_activity.fields: #{out_activity.fields}"
+puts "out_activity.df metadata: #{out_activity.df.vector_metadata}"
+IRuby.display out_activity.df, type: 'text/html'
+
+
+puts '---- ROUTE 2 - via work_df -----'
+work_df = Remi::DataFrame.create(:daru, [], order: out_activity.fields.keys)#, index: in_activity.df.index)
+Remi::SourceToTargetMap.apply(in_activity.df, work_df) do
+  map source(:activity_id) .target(:activity_id)
+  # .transform(Remi::Transform[:enforce_types].(on_error: :ignore))
+
+  map source(:student_id) .target(:student_id)
+  map source(:student_dob) .target(:student_dob)
+end
+
+IRuby.display work_df, type: 'text/html'
+puts "work_df metadata: #{work_df.vector_metadata}"
+
+puts "out_activity.fields metadata: #{out_activity.fields}"
+puts "out_activity.df metadata: #{out_activity.df.vector_metadata}"
+puts "work_df is a #{work_df.class}"
+out_activity.df = work_df#[*out_activity.fields.keys]
+puts "out_activity.fields metadata: #{out_activity.fields}"
+puts "out_activity.df metadata: #{out_activity.df.vector_metadata}"
+IRuby.display out_activity.df, type: 'text/html'
+
+
+=end
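The notes in the spec above state the intended rule: dataframe-level metadata is merged into field metadata, with field metadata winning on conflicts. A minimal sketch of that merge preference in plain Ruby (illustrative only; remi's actual merge lives in the new data_frame/data_subject code and may differ in detail):

    # Illustrative only: field metadata should override dataframe metadata on conflicts.
    df_metadata    = { type: :string, source: :dataframe }
    field_metadata = { type: :date, in_format: '%m/%d/%Y' }

    merged = df_metadata.merge(field_metadata)
    # => { :type => :date, :source => :dataframe, :in_format => "%m/%d/%Y" }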
metadata
CHANGED
@@ -1,29 +1,15 @@
 --- !ruby/object:Gem::Specification
 name: remi
 version: !ruby/object:Gem::Version
-  version: 0.2.27
+  version: 0.2.28
 platform: ruby
 authors:
 - Sterling Paramore
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-04-
+date: 2016-04-28 00:00:00.000000000 Z
 dependencies:
-- !ruby/object:Gem::Dependency
-  name: daru
-  requirement: !ruby/object:Gem::Requirement
-    requirements:
-    - - '='
-      - !ruby/object:Gem::Version
-        version: 0.1.2
-  type: :runtime
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    requirements:
-    - - '='
-      - !ruby/object:Gem::Version
-        version: 0.1.2
 - !ruby/object:Gem::Dependency
   name: bond
   requirement: !ruby/object:Gem::Requirement
@@ -200,6 +186,7 @@ files:
 - features/csv_file_target_job.feature
 - features/examples.feature
 - features/formulas.feature
+- features/metadata.feature
 - features/parameters.feature
 - features/sample_job.feature
 - features/sftp_file_target_job.feature
@@ -215,6 +202,7 @@ files:
 - jobs/all_jobs_shared.rb
 - jobs/copy_source_job.rb
 - jobs/csv_file_target_job.rb
+- jobs/metadata_job.rb
 - jobs/parameters_job.rb
 - jobs/sample_job.rb
 - jobs/sftp_file_target_job.rb
@@ -229,22 +217,18 @@ files:
 - lib/remi/cucumber.rb
 - lib/remi/cucumber/business_rules.rb
 - lib/remi/cucumber/data_source.rb
-- lib/remi/data_source.rb
-- lib/remi/data_source/csv_file.rb
-- lib/remi/data_source/data_frame.rb
-- lib/remi/data_source/postgres.rb
-- lib/remi/data_source/salesforce.rb
+- lib/remi/data_frame.rb
+- lib/remi/data_frame/daru.rb
 - lib/remi/data_subject.rb
-- lib/remi/data_target.rb
-- lib/remi/data_target/csv_file.rb
-- lib/remi/data_target/data_frame.rb
-- lib/remi/data_target/postgres.rb
-- lib/remi/data_target/salesforce.rb
-- lib/remi/data_target/sftp_file.rb
+- lib/remi/data_subject/csv_file.rb
+- lib/remi/data_subject/data_frame.rb
+- lib/remi/data_subject/postgres.rb
+- lib/remi/data_subject/salesforce.rb
+- lib/remi/data_subject/sftp_file.rb
 - lib/remi/extractor/sftp_file.rb
 - lib/remi/field_symbolizers.rb
+- lib/remi/fields.rb
 - lib/remi/job.rb
-- lib/remi/refinements/daru.rb
 - lib/remi/refinements/symbolizer.rb
 - lib/remi/settings.rb
 - lib/remi/sf_bulk_helper.rb
@@ -253,6 +237,7 @@ files:
 - lib/remi/version.rb
 - remi.gemspec
 - spec/extractor/sftp_file_spec.rb
+- spec/metadata_spec.rb
 - spec/remi_spec.rb
 - workbooks/sample_workbook.ipynb
 - workbooks/workbook_helper.rb
@@ -285,6 +270,7 @@ test_files:
 - features/csv_file_target_job.feature
 - features/examples.feature
 - features/formulas.feature
+- features/metadata.feature
 - features/parameters.feature
 - features/sample_job.feature
 - features/sftp_file_target_job.feature
@@ -297,4 +283,5 @@ test_files:
 - features/transforms/prefix.feature
 - features/transforms/truncate.feature
 - spec/extractor/sftp_file_spec.rb
+- spec/metadata_spec.rb
 - spec/remi_spec.rb
data/lib/remi/data_source.rb
DELETED
data/lib/remi/data_source/csv_file.rb
DELETED
@@ -1,101 +0,0 @@
-module Remi
-  module DataSource
-    class CsvFile
-      include DataSource
-
-      using Remi::Refinements::Daru
-
-      def self.default_csv_options
-        CSV::DEFAULT_OPTIONS.merge({
-          headers: true,
-          header_converters: Remi::FieldSymbolizers[:standard],
-          converters: [],
-          col_sep: ',',
-          encoding: 'UTF-8',
-          quote_char: '"'
-        })
-      end
-
-
-      def initialize(fields: {}, extractor:, csv_options: {}, filename_field: nil, logger: Remi::Settings.logger)
-        @fields = fields
-        self.extractor = extractor
-        @csv_options = self.class.default_csv_options.merge(csv_options)
-        @filename_field = filename_field
-        @logger = logger
-      end
-
-      attr_accessor :fields
-      attr_reader :extractor
-      attr_reader :csv_options
-
-      def field_symbolizer
-        self.class.default_csv_options[:header_converters]
-      end
-
-      def extract
-        @extracted = Array(@extractor.extract)
-      end
-
-      def extracted
-        @extracted || extract
-      end
-
-      def extractor=(arg)
-        case arg
-        when Extractor::SftpFile, Extractor::LocalFile
-          @extractor = arg
-        when String
-          @extractor = Extractor::LocalFile.new(path: arg)
-        when Regexp
-          raise "Adding regex matching to local files would be easy, not done yet"
-        else
-          raise "Unknown extractor of type #{arg.class}: #{arg}"
-        end
-      end
-
-      # Only going to support single file for now
-      def source_filename
-        raise "Multiple source files detected" if extracted.size > 1
-        @source_filename ||= extracted.first
-      end
-
-      def first_line
-        # Readline assumes \n line endings. Strip out \r if it is a DOS file.
-        @first_line ||= File.open(source_filename) do |f|
-          f.readline.gsub(/\r/,'')
-        end
-      end
-
-      def headers
-        @headers ||= CSV.open(source_filename, 'r', source_csv_options) { |csv| csv.first }.headers
-      end
-
-      def valid_headers?
-        (fields.keys - headers).empty?
-      end
-
-      def to_dataframe
-        # Assumes that each file has exactly the same structure
-        result_df = nil
-        extracted.each_with_index do |filename, idx|
-          @logger.info "Converting #{filename} to a dataframe"
-          csv_df = Daru::DataFrame.from_csv filename, @csv_options
-
-          csv_df[@filename_field] = Daru::Vector.new([filename] * csv_df.size, index: csv_df.index) if @filename_field
-          if idx == 0
-            result_df = csv_df
-          else
-            result_df = result_df.concat csv_df
-          end
-        end
-
-        result_df
-      end
-
-      def df
-        @dataframe ||= to_dataframe
-      end
-    end
-  end
-end
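For reference, the removed Remi::DataSource::CsvFile was driven through its extractor= coercion and df accessor, roughly as below. This is an illustrative sketch of the old 0.2.27 API based on the deleted code above (paths and field names are made up); the 0.2.28 replacement lives in lib/remi/data_subject/csv_file.rb.

    # Illustrative sketch of the removed 0.2.27 API (paths and fields are made up).
    source = Remi::DataSource::CsvFile.new(
      fields: { id: {}, name: {} },
      extractor: 'data/contacts.csv',       # a String is coerced to Extractor::LocalFile
      csv_options: { col_sep: '|' },
      filename_field: :source_filename      # optional: records which file each row came from
    )

    source.df   # lazily extracts the file(s) and concatenates them into a Daru::DataFrame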
data/lib/remi/data_source/postgres.rb
DELETED
@@ -1,58 +0,0 @@
-module Remi
-  module DataSource
-    class Postgres
-      include DataSource
-
-      def initialize(fields: {}, credentials:, query:, logger: Remi::Settings.logger)
-        @fields = fields
-        @credentials = credentials
-        @query = query
-        @logger = logger
-      end
-
-      attr_accessor :fields
-
-      def extract
-        @logger.info "Executing query #{@query}"
-        @raw_result = connection.exec @query
-      end
-
-      def raw_result
-        @raw_result ||= extract
-      end
-
-      def connection
-        @connection ||= PG.connect(
-          host: @credentials[:host] || 'localhost',
-          port: @credentials[:port] || 5432,
-          dbname: @credentials[:dbname],
-          user: @credentials[:user] || `whoami`.chomp,
-          password: @credentials[:password],
-          sslmode: @credentials[:sslmode] || 'allow'
-        )
-      end
-
-
-      def to_dataframe
-        # Performance for larger sets could be improved by using bulk query (via COPY)
-        @logger.info "Converting query to a dataframe"
-
-        hash_array = {}
-        raw_result.each do |row|
-          row.each do |field, value|
-            (hash_array[field_symbolizer.call(field)] ||= []) << value
-          end
-        end
-
-        # After converting to DF, clear the PG results to save memory.
-        raw_result.clear
-
-        Daru::DataFrame.new hash_array, order: hash_array.keys
-      end
-
-      def df
-        @dataframe ||= to_dataframe
-      end
-    end
-  end
-end
data/lib/remi/data_source/salesforce.rb
DELETED
@@ -1,87 +0,0 @@
-module Remi
-  module DataSource
-    class Salesforce
-      include DataSource
-
-      def initialize(fields: {}, object:, query:, credentials:, api: :bulk, logger: Remi::Settings.logger)
-        @fields = fields
-        @sfo = object
-        @query = query
-        @credentials = credentials
-        @api = api
-        @logger = logger
-      end
-
-      attr_accessor :fields
-      attr_accessor :raw_result
-
-      def field_symbolizer
-        Remi::FieldSymbolizers[:salesforce]
-      end
-
-      def extract
-        @raw_result = sf_bulk.query(@sfo, @query, 10000)
-
-        check_for_errors(@raw_result)
-        @raw_result
-      end
-
-      def check_for_errors(sf_result)
-        sf_result['batches'].each do |batch|
-          raise "Error with batch #{batch['id']} - #{batch['state']}: #{batch['stateMessage']}" unless batch['state'].first == 'Completed'
-        end
-      end
-
-      def raw_result
-        @raw_result ||= extract
-      end
-
-
-
-      def restforce_client
-        @restforce_client ||= begin
-          client = Restforce.new(@credentials)
-
-          #run a dummy query to initiate a connection. Workaround for Bulk API problem
-          # https://github.com/yatish27/salesforce_bulk_api/issues/33
-          client.query('SELECT Id FROM Contact LIMIT 1')
-          client
-        end
-      end
-
-      def sf_bulk
-        @sf_bulk ||= SalesforceBulkApi::Api.new(restforce_client).tap { |o| o.connection.set_status_throttle(5) }
-      end
-
-      def to_dataframe
-        @logger.info "Converting salesforce query results to a dataframe"
-
-        hash_array = {}
-        raw_result['batches'].each do |batch|
-          next unless batch['response']
-
-          batch['response'].each do |record|
-            record.each do |field, value|
-              next if ['xsi:type','type'].include? field
-              (hash_array[field.to_sym] ||= []) << case value.first
-                when Hash
-                  value.first["xsi:nil"] == "true" ? nil : value.first
-                else
-                  value.first
-              end
-            end
-          end
-
-          # delete raw result at end of processing to free memory
-          batch['response'] = nil
-        end
-
-        Daru::DataFrame.new hash_array, order: hash_array.keys
-      end
-
-      def df
-        @dataframe ||= to_dataframe
-      end
-    end
-  end
-end
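For reference, the removed Remi::DataSource::Salesforce was used the same way: build it with an object name, a SOQL query, and Restforce credentials, then call df. This is an illustrative sketch of the old 0.2.27 API based on the deleted code above; the object, query, and credential values are made up, and the credential keys shown are ordinary Restforce options rather than anything taken from this diff.

    # Illustrative sketch of the removed 0.2.27 API (object, query, and credentials are made up).
    contacts = Remi::DataSource::Salesforce.new(
      fields:      { Id: {}, Student_ID__c: {} },
      object:      :Contact,
      query:       'SELECT Id, Student_ID__c FROM Contact',
      credentials: {
        username:       'user@example.com',
        password:       'secret',
        security_token: 'token',
        client_id:      'client-id',
        client_secret:  'client-secret',
        host:           'test.salesforce.com'
      },
      api: :bulk
    )

    contacts.df  # runs the bulk query, checks each batch for errors, and returns a Daru::DataFrame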