remi 0.0.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60):
  1. checksums.yaml +4 -4
  2. data/.bundle/config +2 -0
  3. data/.gitignore +3 -2
  4. data/.rspec +2 -0
  5. data/.ruby-version +1 -0
  6. data/Gemfile +4 -0
  7. data/Gemfile.lock +123 -0
  8. data/LICENSE.txt +21 -0
  9. data/README.md +94 -3
  10. data/bin/remi +8 -0
  11. data/doc/install-rbenv-os_x.md +47 -0
  12. data/lib/remi.rb +56 -9
  13. data/lib/remi/cli.rb +56 -0
  14. data/lib/remi/core/daru.rb +28 -0
  15. data/lib/remi/core/refinements.rb +21 -0
  16. data/lib/remi/core/string.rb +8 -0
  17. data/lib/remi/cucumber.rb +7 -0
  18. data/lib/remi/cucumber/business_rules.rb +504 -0
  19. data/lib/remi/cucumber/data_source.rb +63 -0
  20. data/lib/remi/data_source.rb +13 -0
  21. data/lib/remi/data_source/csv_file.rb +79 -0
  22. data/lib/remi/data_source/data_frame.rb +10 -0
  23. data/lib/remi/data_source/postgres.rb +58 -0
  24. data/lib/remi/data_source/salesforce.rb +78 -0
  25. data/lib/remi/data_subject.rb +25 -0
  26. data/lib/remi/data_target.rb +15 -0
  27. data/lib/remi/data_target/csv_file.rb +49 -0
  28. data/lib/remi/data_target/data_frame.rb +14 -0
  29. data/lib/remi/data_target/salesforce.rb +49 -0
  30. data/lib/remi/extractor/sftp_file.rb +84 -0
  31. data/lib/remi/field_symbolizers.rb +17 -0
  32. data/lib/remi/job.rb +200 -0
  33. data/lib/remi/lookup/regex_sieve.rb +55 -0
  34. data/lib/remi/project/features/examples.feature +24 -0
  35. data/lib/remi/project/features/formulas.feature +64 -0
  36. data/lib/remi/project/features/sample_job.feature +304 -0
  37. data/lib/remi/project/features/step_definitions/remi_step.rb +310 -0
  38. data/lib/remi/project/features/support/env.rb +10 -0
  39. data/lib/remi/project/features/support/env_app.rb +3 -0
  40. data/lib/remi/project/features/transforms/date_diff.feature +50 -0
  41. data/lib/remi/project/features/transforms/parse_date.feature +34 -0
  42. data/lib/remi/project/features/transforms/prefix.feature +15 -0
  43. data/lib/remi/project/jobs/all_jobs_shared.rb +25 -0
  44. data/lib/remi/project/jobs/copy_source_job.rb +12 -0
  45. data/lib/remi/project/jobs/sample_job.rb +164 -0
  46. data/lib/remi/project/jobs/transforms/date_diff_job.rb +17 -0
  47. data/lib/remi/project/jobs/transforms/parse_date_job.rb +18 -0
  48. data/lib/remi/project/jobs/transforms/prefix_job.rb +16 -0
  49. data/lib/remi/project/jobs/transforms/transform_jobs.rb +3 -0
  50. data/lib/remi/settings.rb +39 -0
  51. data/lib/remi/sf_bulk_helper.rb +265 -0
  52. data/lib/remi/source_to_target_map.rb +93 -0
  53. data/lib/remi/transform.rb +137 -0
  54. data/lib/remi/version.rb +3 -0
  55. data/remi.gemspec +25 -7
  56. data/workbooks/sample_workbook.ipynb +56 -0
  57. data/workbooks/workbook_helper.rb +1 -0
  58. metadata +234 -17
  59. data/lib/noodling.rb +0 -163
  60. data/test/test_NAME.rb +0 -19
module Remi
  module DataSource
    # Mixin providing canned "stub" data for a data source, used by the
    # Cucumber test harness to fabricate example records.  Host classes are
    # expected to define @fields (a hash of field name => metadata hash).
    module DataStub
      # Builds one row of stub data, one element per field, in @fields order.
      # Dates are rendered with the field's :format; fields with no :type
      # fall back to the stub string value.
      def stub_row_array
        @fields.values.map do |field_meta|
          field_type = field_meta[:type]
          if field_type == :date
            stub_values[:date].strftime(field_meta[:format])
          elsif field_type.nil?
            stub_values[:string]
          else
            stub_values[field_type]
          end
        end
      end

      # Replaces the subject's dataframe with a single-row stub dataframe.
      def stub_df
        stub_frame = Daru::DataFrame.new([], order: @fields.keys)
        stub_frame.add_row(stub_row_array)
        self.df = stub_frame
      end

      # Canonical stub value for each supported field type (memoized).
      def stub_values
        @stub_values ||= {
          string: "Some String",
          number: 133,
          float: 3.14159,
          integer: 38,
          date: Date.parse('2015-10-21')
        }
      end
    end


    # Adds CSV-specific stubbing: writes the stub row to a temporary file so
    # the normal CSV extraction path can read it back.
    class CsvFile
      include DataStub

      # Path of the temporary stub file (memoized).
      def stub_tmp_file
        @stub_tmp_file ||= Tempfile.new('stub_tmp_file.csv').path
      end

      # Writes a header line and one stub data row to the temp file and
      # returns its path.
      def write_stub_tmp_file
        File.open(stub_tmp_file, "wb") do |stub_file|
          stub_file.puts stub_header
          stub_file.puts stub_row_csv
        end

        stub_tmp_file
      end

      # Header row built from the field names joined by the CSV delimiter.
      def stub_header
        @fields.keys.join(@csv_options[:col_sep])
      end

      # Stub data row serialized with the CSV delimiter.
      def stub_row_csv
        stub_row_array.join(@csv_options[:col_sep])
      end
    end

    # Salesforce sources stub via the shared dataframe mechanism only.
    class Salesforce
      include DataStub
    end
  end
end
module Remi
  # Mixin for all data sources.  Pulls in the shared DataSubject behavior
  # (dataframe accessors, default field symbolizer) and defines the
  # contract that concrete sources must implement.
  module DataSource
    include DataSubject

    # Concrete sources must override this to pull data from their backend.
    # @raise [RuntimeError] always, unless overridden.
    def extract
      raise "Extract function undefined for #{self.class.name}"
    end

    # Default symbolizer used to convert raw field names into symbols.
    # FIX: this was originally misspelled `feild_symbolizer`, so callers of
    # `field_symbolizer` never reached it and silently got the DataSubject
    # default instead.
    def field_symbolizer
      Remi::FieldSymbolizers[:standard]
    end

    # Backward-compatible alias for the original misspelled method name.
    alias_method :feild_symbolizer, :field_symbolizer
  end
end
module Remi
  module DataSource
    # Data source that reads a single CSV file (local or SFTP-extracted)
    # and exposes it as a Daru dataframe.
    class CsvFile
      include DataSource

      # Default CSV parsing options: headers on, standard field symbolizer
      # for header conversion, comma-separated, UTF-8, double-quote quoting.
      def self.default_csv_options
        CSV::DEFAULT_OPTIONS.merge({
          headers: true,
          header_converters: Remi::FieldSymbolizers[:standard],
          col_sep: ',',
          encoding: 'UTF-8',
          quote_char: '"'
        })
      end


      # @param fields [Hash] field name => metadata hash
      # @param extractor [Extractor::SftpFile, Extractor::LocalFile, String]
      #   extractor instance, or a path string (wrapped in a LocalFile)
      # @param csv_options [Hash] overrides merged onto default_csv_options
      # @param logger [Logger] destination for progress messages
      def initialize(fields: {}, extractor:, csv_options: {}, logger: Remi::Settings.logger)
        @fields = fields
        self.extractor = extractor
        @csv_options = self.class.default_csv_options.merge(csv_options)
        @logger = logger
      end

      attr_accessor :fields
      attr_reader :extractor
      attr_reader :csv_options

      # Symbolizer used for header conversion.
      def field_symbolizer
        self.class.default_csv_options[:header_converters]
      end

      # Returns the extracted file list (as an array).
      # @raise [RuntimeError] if more than one file was extracted.
      def extract
        Array(@extractor.extract).tap { |x| raise "Multiple files not supported" if x.size > 1 }
      end

      # Accepts an extractor instance or a plain path string; rejects
      # anything else.
      def extractor=(arg)
        case arg
        when Extractor::SftpFile, Extractor::LocalFile
          @extractor = arg
        when String
          @extractor = Extractor::LocalFile.new(arg)
        when Regexp
          raise "Adding regex matching to local files would be easy, not done yet"
        else
          raise "Unknown extractor of type #{arg.class}: #{arg}"
        end
      end

      # Only going to support single file for now.
      def source_filename
        @source_filename ||= extract.first
      end

      # First line of the file, with DOS carriage returns stripped.
      def first_line
        # Readline assumes \n line endings. Strip out \r if it is a DOS file.
        @first_line ||= File.open(source_filename) do |f|
          f.readline.gsub(/\r/, '')
        end
      end

      # Header names parsed from the file.
      # FIX: previously referenced the undefined `source_csv_options`
      # (NameError on first call); uses the instance's @csv_options, splatted
      # as keywords for modern CSV.open signatures.
      def headers
        @headers ||= CSV.open(source_filename, 'r', **@csv_options) { |csv| csv.first }.headers
      end

      # True when every declared field appears in the file's headers.
      def valid_headers?
        (fields.keys - headers).empty?
      end

      # Parses the whole file into a dataframe.
      def to_dataframe
        @logger.info "Converting #{source_filename} to a dataframe"
        Daru::DataFrame.from_csv source_filename, @csv_options
      end

      # Memoized dataframe view of the file.
      def df
        @dataframe ||= to_dataframe
      end
    end
  end
end
module Remi
  module DataSource
    # Placeholder dataframe-backed source.  All behavior comes from
    # DataSubject; the constructor accepts and ignores any keyword
    # arguments so it stays signature-compatible with the other sources.
    class DataFrame
      include DataSubject

      def initialize(**_args)
      end
    end
  end
end
module Remi
  module DataSource
    # Data source that executes a SQL query against a Postgres database
    # and exposes the result as a Daru dataframe.
    class Postgres
      include DataSource

      # @param fields [Hash] field name => metadata hash
      # @param credentials [Hash] connection settings (:host, :port,
      #   :dbname, :user, :password, :sslmode)
      # @param query [String] SQL to execute
      # @param logger [Logger] destination for progress messages
      def initialize(fields: {}, credentials:, query:, logger: Remi::Settings.logger)
        @fields = fields
        @credentials = credentials
        @query = query
        @logger = logger
      end

      attr_accessor :fields

      # Executes the query and caches the raw PG result.
      def extract
        @logger.info "Executing query #{@query}"
        @raw_result = pg_conn.exec @query
      end

      # Raw PG result, running the extract on first access.
      def raw_result
        @raw_result ||= extract
      end

      # Lazily-opened database connection.  `||` (rather than fetch) so an
      # explicitly nil credential still falls back to its default.
      def pg_conn
        @pg_conn ||= PG.connect(
          host: @credentials[:host] || 'localhost',
          port: @credentials[:port] || 5432,
          dbname: @credentials[:dbname],
          user: @credentials[:user] || `whoami`.chomp,
          password: @credentials[:password],
          sslmode: @credentials[:sslmode] || 'require'
        )
      end


      # Converts the raw query result into a dataframe, symbolizing field
      # names, then clears the PG result to release memory.
      def to_dataframe
        # Performance for larger sets could be improved by using bulk query (via COPY)
        @logger.info "Converting query to a dataframe"

        columns = raw_result.each_with_object({}) do |row, acc|
          row.each do |field, value|
            (acc[field_symbolizer.call(field)] ||= []) << value
          end
        end

        # After converting to DF, clear the PG results to save memory.
        raw_result.clear

        Daru::DataFrame.new columns, order: columns.keys
      end

      # Memoized dataframe view of the query results.
      def df
        @dataframe ||= to_dataframe
      end
    end
  end
end
module Remi
  module DataSource
    # Data source that runs a SOQL query through the Salesforce bulk API
    # and exposes the results as a Daru dataframe.
    class Salesforce
      include DataSource

      # @param fields [Hash] field name => metadata hash
      # @param object [String] Salesforce object to query (e.g. 'Contact')
      # @param query [String] SOQL query text
      # @param credentials [Hash] Restforce credentials
      # @param api [Symbol] API flavor; only :bulk is currently used
      # @param logger [Logger] destination for progress messages
      def initialize(fields: {}, object:, query:, credentials:, api: :bulk, logger: Remi::Settings.logger)
        @fields = fields
        @sfo = object
        @query = query
        @credentials = credentials
        @api = api
        @logger = logger
      end

      attr_accessor :fields
      # Public writer; the reader below adds lazy extraction on top.
      attr_writer :raw_result

      # Salesforce field names need their own symbolizer.
      def field_symbolizer
        Remi::FieldSymbolizers[:salesforce]
      end

      # Runs the bulk query (batch size 10000) and caches the raw payload.
      def extract
        @raw_result = sf_bulk.query(@sfo, @query, 10000)
      end

      # Raw bulk-query payload, extracting on first access.
      def raw_result
        @raw_result ||= extract
      end



      # Restforce connection (memoized).
      def restforce_client
        return @restforce_client if @restforce_client

        client = Restforce.new(@credentials)
        # Run a dummy query to initiate a connection.  Workaround for Bulk
        # API problem: https://github.com/yatish27/salesforce_bulk_api/issues/33
        client.query('SELECT Id FROM Contact LIMIT 1')
        @restforce_client = client
      end

      # Bulk API wrapper with a 5-second status-polling throttle (memoized).
      def sf_bulk
        @sf_bulk ||= SalesforceBulkApi::Api.new(restforce_client).tap { |api| api.connection.set_status_throttle(5) }
      end

      # Flattens the batched bulk-query response into column arrays and
      # wraps them in a dataframe.  Batch responses are nulled out as they
      # are consumed to free memory.
      def to_dataframe
        @logger.info "Converting salesforce query results to a dataframe"

        columns = {}
        raw_result['batches'].each do |batch|
          records = batch['response']
          next unless records

          records.each do |record|
            record.each do |field, value|
              next if ['xsi:type', 'type'].include? field

              first_value = value.first
              cell =
                if first_value.is_a?(Hash)
                  # The bulk API encodes nils as {"xsi:nil" => "true"}.
                  first_value["xsi:nil"] == "true" ? nil : first_value
                else
                  first_value
                end
              (columns[field.to_sym] ||= []) << cell
            end
          end

          # Delete raw result at end of processing to free memory.
          batch['response'] = nil
        end

        Daru::DataFrame.new columns, order: columns.keys
      end

      # Memoized dataframe view of the query results.
      def df
        @dataframe ||= to_dataframe
      end
    end
  end
end
module Remi
  # Shared behavior for anything that carries a dataframe (sources and
  # targets alike): dataframe accessors, a default field symbolizer, and
  # field metadata derived from the dataframe's vectors.
  module DataSubject
    # Default symbolizer for converting raw field names into symbols.
    def field_symbolizer
      Remi::FieldSymbolizers[:standard]
    end

    # The subject's dataframe, defaulting to an empty one.
    def df
      @dataframe ||= Daru::DataFrame.new([])
    end

    # Replaces the subject's dataframe.
    def df=(new_dataframe)
      @dataframe = new_dataframe
    end

    # Fields is a hash where the keys are the data field names and the
    # values are a hash of metadata.  DataFrames do not currently support
    # metadata, so the metadata is empty unless overridden by the specific
    # target.
    def fields
      df.vectors.to_a.each_with_object({}) do |vector, meta|
        meta[vector] = {}
      end
    end
  end
end
module Remi
  # Mixin for all data targets.  Pulls in the shared DataSubject behavior
  # and defines the load contract that concrete targets must implement.
  module DataTarget
    include DataSubject

    # Gets called automatically at the end of a job, but could also get
    # manually called at the end of a transform, so it guards against
    # loading twice.  Concrete targets must override this.
    def load
      @logger.info "Loading target"
      if @loaded
        true
      else
        # NOTE(review): @loaded is flipped before raising, so a second call
        # returns true even though nothing was loaded — confirm intent.
        @loaded = true
        raise "Load function undefined for #{self.class.name}"
      end
    end
  end
end
module Remi
  module DataTarget
    # Data target that loads a dataframe into Salesforce via the bulk API.
    # NOTE(review): in the diff listing this hunk sits where
    # data_target/csv_file.rb is expected, but its content is the
    # Salesforce target — verify against the published gem.
    class Salesforce
      include DataTarget

      # @param object [String] Salesforce object name (e.g. 'Contact')
      # @param operation [Symbol] :update or :create
      # @param credentials [Hash] Restforce credentials
      # @param api [Symbol] API flavor; only :bulk is currently used
      # @param logger [Logger] destination for progress messages
      def initialize(object:, operation:, credentials:, api: :bulk, logger: Remi::Settings.logger)
        @sfo = object
        @operation = operation
        @credentials = credentials
        @api = api
        @logger = logger
      end

      # Salesforce field names need their own symbolizer.
      def field_symbolizer
        Remi::FieldSymbolizers[:salesforce]
      end

      # Pushes the dataframe to Salesforce.  No-op (returns true) when
      # already loaded or when there is nothing to send.
      def load
        return true if @loaded || df.size == 0

        @logger.info "Performing Salesforce #{@operation} on object #{@sfo}"

        case @operation
        when :update
          Remi::SfBulkHelper::SfBulkUpdate.update(restforce_client, @sfo, df_as_array_of_hashes, logger: @logger)
        when :create
          Remi::SfBulkHelper::SfBulkCreate.create(restforce_client, @sfo, df_as_array_of_hashes, logger: @logger)
        end

        @loaded = true
      end

      # Restforce connection (memoized).
      def restforce_client
        return @restforce_client if @restforce_client

        client = Restforce.new(@credentials)
        # Run a dummy query to initiate a connection.  Workaround for Bulk
        # API problem: https://github.com/yatish27/salesforce_bulk_api/issues/33
        client.query('SELECT Id FROM Contact LIMIT 1')
        @restforce_client = client
      end

      # The bulk helpers want an array of row hashes, which is the first
      # element of the dataframe's #to_a representation.
      def df_as_array_of_hashes
        df.to_a.first
      end

    end
  end
end
module Remi
  module DataTarget
    # In-memory dataframe target: nothing to persist, so #load is a no-op.
    # NOTE(review): includes DataSubject rather than DataTarget — presumably
    # deliberate since it supplies its own #load; confirm.
    class DataFrame
      include DataSubject

      def initialize(**_args)
      end

      # Nothing external to load into; always reports success.
      def load
        true
      end
    end
  end
end
module Remi
  module DataTarget
    # Data target that loads a dataframe into Salesforce via the bulk API.
    class Salesforce
      include DataTarget

      # @param object [String] Salesforce object name (e.g. 'Contact')
      # @param operation [Symbol] :update or :create
      # @param credentials [Hash] Restforce credentials
      # @param api [Symbol] API flavor; only :bulk is currently used
      # @param logger [Logger] destination for progress messages
      def initialize(object:, operation:, credentials:, api: :bulk, logger: Remi::Settings.logger)
        @sfo = object
        @operation = operation
        @credentials = credentials
        @api = api
        @logger = logger
      end

      # Salesforce field names need their own symbolizer.
      def field_symbolizer
        Remi::FieldSymbolizers[:salesforce]
      end

      # Pushes the dataframe to Salesforce.  No-op (returns true) when
      # already loaded or when there is nothing to send.
      def load
        return true if @loaded || df.size == 0

        @logger.info "Performing Salesforce #{@operation} on object #{@sfo}"

        case @operation
        when :update
          Remi::SfBulkHelper::SfBulkUpdate.update(restforce_client, @sfo, df_as_array_of_hashes, logger: @logger)
        when :create
          Remi::SfBulkHelper::SfBulkCreate.create(restforce_client, @sfo, df_as_array_of_hashes, logger: @logger)
        end

        @loaded = true
      end

      # Restforce connection (memoized).
      def restforce_client
        return @restforce_client if @restforce_client

        client = Restforce.new(@credentials)
        # Run a dummy query to initiate a connection.  Workaround for Bulk
        # API problem: https://github.com/yatish27/salesforce_bulk_api/issues/33
        client.query('SELECT Id FROM Contact LIMIT 1')
        @restforce_client = client
      end

      # The bulk helpers want an array of row hashes, which is the first
      # element of the dataframe's #to_a representation.
      def df_as_array_of_hashes
        df.to_a.first
      end

    end
  end
end