remi 0.0.1 → 0.2.2

Sign up to get free protection for your applications and to gain access to all of the features.
Files changed (60) hide show
  1. checksums.yaml +4 -4
  2. data/.bundle/config +2 -0
  3. data/.gitignore +3 -2
  4. data/.rspec +2 -0
  5. data/.ruby-version +1 -0
  6. data/Gemfile +4 -0
  7. data/Gemfile.lock +123 -0
  8. data/LICENSE.txt +21 -0
  9. data/README.md +94 -3
  10. data/bin/remi +8 -0
  11. data/doc/install-rbenv-os_x.md +47 -0
  12. data/lib/remi.rb +56 -9
  13. data/lib/remi/cli.rb +56 -0
  14. data/lib/remi/core/daru.rb +28 -0
  15. data/lib/remi/core/refinements.rb +21 -0
  16. data/lib/remi/core/string.rb +8 -0
  17. data/lib/remi/cucumber.rb +7 -0
  18. data/lib/remi/cucumber/business_rules.rb +504 -0
  19. data/lib/remi/cucumber/data_source.rb +63 -0
  20. data/lib/remi/data_source.rb +13 -0
  21. data/lib/remi/data_source/csv_file.rb +79 -0
  22. data/lib/remi/data_source/data_frame.rb +10 -0
  23. data/lib/remi/data_source/postgres.rb +58 -0
  24. data/lib/remi/data_source/salesforce.rb +78 -0
  25. data/lib/remi/data_subject.rb +25 -0
  26. data/lib/remi/data_target.rb +15 -0
  27. data/lib/remi/data_target/csv_file.rb +49 -0
  28. data/lib/remi/data_target/data_frame.rb +14 -0
  29. data/lib/remi/data_target/salesforce.rb +49 -0
  30. data/lib/remi/extractor/sftp_file.rb +84 -0
  31. data/lib/remi/field_symbolizers.rb +17 -0
  32. data/lib/remi/job.rb +200 -0
  33. data/lib/remi/lookup/regex_sieve.rb +55 -0
  34. data/lib/remi/project/features/examples.feature +24 -0
  35. data/lib/remi/project/features/formulas.feature +64 -0
  36. data/lib/remi/project/features/sample_job.feature +304 -0
  37. data/lib/remi/project/features/step_definitions/remi_step.rb +310 -0
  38. data/lib/remi/project/features/support/env.rb +10 -0
  39. data/lib/remi/project/features/support/env_app.rb +3 -0
  40. data/lib/remi/project/features/transforms/date_diff.feature +50 -0
  41. data/lib/remi/project/features/transforms/parse_date.feature +34 -0
  42. data/lib/remi/project/features/transforms/prefix.feature +15 -0
  43. data/lib/remi/project/jobs/all_jobs_shared.rb +25 -0
  44. data/lib/remi/project/jobs/copy_source_job.rb +12 -0
  45. data/lib/remi/project/jobs/sample_job.rb +164 -0
  46. data/lib/remi/project/jobs/transforms/date_diff_job.rb +17 -0
  47. data/lib/remi/project/jobs/transforms/parse_date_job.rb +18 -0
  48. data/lib/remi/project/jobs/transforms/prefix_job.rb +16 -0
  49. data/lib/remi/project/jobs/transforms/transform_jobs.rb +3 -0
  50. data/lib/remi/settings.rb +39 -0
  51. data/lib/remi/sf_bulk_helper.rb +265 -0
  52. data/lib/remi/source_to_target_map.rb +93 -0
  53. data/lib/remi/transform.rb +137 -0
  54. data/lib/remi/version.rb +3 -0
  55. data/remi.gemspec +25 -7
  56. data/workbooks/sample_workbook.ipynb +56 -0
  57. data/workbooks/workbook_helper.rb +1 -0
  58. metadata +234 -17
  59. data/lib/noodling.rb +0 -163
  60. data/test/test_NAME.rb +0 -19
@@ -0,0 +1,63 @@
module Remi
  module DataSource
    # Mixin that generates canned-but-valid sample data for a data source,
    # driven by the source's field metadata (@fields).
    module DataStub
      # Builds one row of stub values, one entry per field, honoring each
      # field's :type metadata. Dates are rendered using the field's :format;
      # fields with no declared type fall back to the string stub.
      def stub_row_array
        @fields.values.map do |field_meta|
          type = field_meta[:type]
          if type == :date
            stub_values[:date].strftime(field_meta[:format])
          elsif type.nil?
            stub_values[:string]
          else
            stub_values[type]
          end
        end
      end

      # Replaces this subject's dataframe with a single stubbed row.
      def stub_df
        stub = Daru::DataFrame.new([], order: @fields.keys)
        stub.add_row(stub_row_array)
        self.df = stub
      end

      # Canonical sample value for each supported field type.
      def stub_values
        @stub_values ||= {
          string: "Some String",
          number: 133,
          float: 3.14159,
          integer: 38,
          date: Date.parse('2015-10-21')
        }
      end
    end


    class CsvFile
      include DataStub

      # Path of a temporary file used to hold stubbed CSV content.
      def stub_tmp_file
        @stub_tmp_file ||= Tempfile.new('stub_tmp_file.csv').path
      end

      # Writes a header record plus one stubbed data record, then returns
      # the path of the temporary file.
      def write_stub_tmp_file
        File.open(stub_tmp_file, "wb") do |file|
          file.puts stub_header
          file.puts stub_row_csv
        end

        stub_tmp_file
      end

      # Header record built by joining the field names with the CSV separator.
      def stub_header
        @fields.keys.join(@csv_options[:col_sep])
      end

      # A single stubbed data record in CSV form.
      def stub_row_csv
        stub_row_array.join(@csv_options[:col_sep])
      end
    end

    class Salesforce
      include DataStub
    end
  end
end
@@ -0,0 +1,13 @@
module Remi
  # Common behavior shared by all data sources. Including classes are
  # expected to override #extract with something that actually pulls data.
  module DataSource
    include DataSubject

    # Subclasses must implement extraction; the base version always raises.
    def extract
      raise "Extract function undefined for #{self.class.name}"
    end

    # Default converter used to normalize raw field names into symbols.
    #
    # Bug fix: this method was misspelled `feild_symbolizer`, so it never
    # participated in the `field_symbolizer` protocol used elsewhere
    # (DataSubject and the concrete sources all spell it correctly). The
    # misspelled name is kept as an alias for backward compatibility.
    def field_symbolizer
      Remi::FieldSymbolizers[:standard]
    end
    alias_method :feild_symbolizer, :field_symbolizer
  end
end
@@ -0,0 +1,79 @@
module Remi
  module DataSource
    # CSV-backed data source. Wraps an extractor (local or SFTP file) and
    # exposes the extracted file as a Daru::DataFrame.
    class CsvFile
      include DataSource

      # Baseline CSV parsing options: headered files, UTF-8, comma-separated,
      # with header names symbolized via the standard field symbolizer.
      def self.default_csv_options
        CSV::DEFAULT_OPTIONS.merge({
          headers: true,
          header_converters: Remi::FieldSymbolizers[:standard],
          col_sep: ',',
          encoding: 'UTF-8',
          quote_char: '"'
        })
      end


      # @param fields [Hash] field name => metadata hash
      # @param extractor [Extractor::SftpFile, Extractor::LocalFile, String]
      #   the extractor, or a path string coerced into a LocalFile
      # @param csv_options [Hash] overrides merged on top of default_csv_options
      # @param logger [Logger] destination for progress messages
      def initialize(fields: {}, extractor:, csv_options: {}, logger: Remi::Settings.logger)
        @fields = fields
        self.extractor = extractor
        @csv_options = self.class.default_csv_options.merge(csv_options)
        @logger = logger
      end

      attr_accessor :fields
      attr_reader :extractor
      attr_reader :csv_options

      # NOTE(review): returns the class-level default converter, ignoring any
      # :header_converters override supplied via csv_options — confirm intended.
      def field_symbolizer
        self.class.default_csv_options[:header_converters]
      end

      # Runs the extractor; only a single extracted file is supported.
      def extract
        Array(@extractor.extract).tap { |x| raise "Multiple files not supported" if x.size > 1 }
      end

      # Coerces the supplied extractor into a supported extractor object.
      def extractor=(arg)
        case arg
        when Extractor::SftpFile, Extractor::LocalFile
          @extractor = arg
        when String
          @extractor = Extractor::LocalFile.new(arg)
        when Regexp
          raise "Adding regex matching to local files would be easy, not done yet"
        else
          raise "Unknown extractor of type #{arg.class}: #{arg}"
        end
      end

      # Only going to support single file for now
      def source_filename
        @source_filename ||= extract.first
      end

      # First line of the source file, with DOS carriage returns stripped.
      def first_line
        # Readline assumes \n line endings. Strip out \r if it is a DOS file.
        @first_line ||= File.open(source_filename) do |f|
          f.readline.gsub(/\r/,'')
        end
      end

      # Header row of the source file.
      #
      # Bug fix: previously called `source_csv_options`, which is not defined
      # anywhere in this class and raised NameError; the instance's
      # @csv_options is what was intended.
      def headers
        @headers ||= CSV.open(source_filename, 'r', @csv_options) { |csv| csv.first }.headers
      end

      # True when every declared field is present among the file's headers.
      def valid_headers?
        (fields.keys - headers).empty?
      end

      # Parses the source file into a Daru::DataFrame.
      def to_dataframe
        @logger.info "Converting #{source_filename} to a dataframe"
        Daru::DataFrame.from_csv source_filename, @csv_options
      end

      # Memoized dataframe of the source contents.
      def df
        @dataframe ||= to_dataframe
      end
    end
  end
end
@@ -0,0 +1,10 @@
module Remi
  module DataSource
    # Dataframe-backed source, used when the data already lives in memory.
    # Relies entirely on DataSubject for its df accessors.
    class DataFrame
      include DataSubject

      # Accepts arbitrary keyword options for interface parity with the other
      # sources; none are currently used.
      def initialize(**args)
      end
    end
  end
end
@@ -0,0 +1,58 @@
module Remi
  module DataSource
    # Data source that pulls rows from a PostgreSQL query and converts
    # the result into a Daru::DataFrame.
    class Postgres
      include DataSource

      # @param fields [Hash] field name => metadata hash
      # @param credentials [Hash] connection settings
      #   (:host, :port, :dbname, :user, :password, :sslmode)
      # @param query [String] SQL to execute
      # @param logger [Logger] destination for progress messages
      def initialize(fields: {}, credentials:, query:, logger: Remi::Settings.logger)
        @fields = fields
        @credentials = credentials
        @query = query
        @logger = logger
      end

      attr_accessor :fields

      # Executes the query and caches the raw PG result.
      def extract
        @logger.info "Executing query #{@query}"
        @raw_result = pg_conn.exec @query
      end

      # Memoized raw query result; triggers extraction on first access.
      def raw_result
        @raw_result ||= extract
      end

      # Lazily-opened connection; credentials fall back to local defaults.
      def pg_conn
        @pg_conn ||= PG.connect(
          host: @credentials[:host] || 'localhost',
          port: @credentials[:port] || 5432,
          dbname: @credentials[:dbname],
          user: @credentials[:user] || `whoami`.chomp,
          password: @credentials[:password],
          sslmode: @credentials[:sslmode] || 'require'
        )
      end


      # Pivots the row-oriented PG result into column vectors keyed by
      # symbolized field name, then wraps them in a Daru::DataFrame.
      def to_dataframe
        # Performance for larger sets could be improved by using bulk query (via COPY)
        @logger.info "Converting query to a dataframe"

        columns = raw_result.each_with_object({}) do |row, acc|
          row.each do |field, value|
            (acc[field_symbolizer.call(field)] ||= []) << value
          end
        end

        # After converting to DF, clear the PG results to save memory.
        raw_result.clear

        Daru::DataFrame.new columns, order: columns.keys
      end

      # Memoized dataframe of the query results.
      def df
        @dataframe ||= to_dataframe
      end
    end
  end
end
@@ -0,0 +1,78 @@
module Remi
  module DataSource
    # Data source that pulls records from Salesforce via the Bulk API
    # (SalesforceBulkApi on top of a Restforce client).
    class Salesforce
      include DataSource

      # @param fields [Hash] field name => metadata hash
      # @param object [Symbol, String] Salesforce object to query (e.g. :Contact)
      # @param query [String] SOQL query to run
      # @param credentials [Hash] Restforce connection credentials
      # @param api [Symbol] stored but not otherwise read here — TODO confirm use
      # @param logger [Logger] destination for progress messages
      def initialize(fields: {}, object:, query:, credentials:, api: :bulk, logger: Remi::Settings.logger)
        @fields = fields
        @sfo = object
        @query = query
        @credentials = credentials
        @api = api
        @logger = logger
      end

      attr_accessor :fields
      # NOTE(review): the reader half of this accessor is shadowed by the
      # memoizing `raw_result` method defined below; only the writer is
      # effectively added here.
      attr_accessor :raw_result

      # Salesforce field names get their own symbolizer.
      def field_symbolizer
        Remi::FieldSymbolizers[:salesforce]
      end

      # Runs the SOQL query in batches of 10,000 and caches the raw result.
      def extract
        @raw_result = sf_bulk.query(@sfo, @query, 10000)
      end

      # Memoized raw bulk-query result; triggers extraction on first access.
      def raw_result
        @raw_result ||= extract
      end



      # Memoized Restforce client.
      def restforce_client
        @restforce_client ||= begin
          client = Restforce.new(@credentials)

          #run a dummy query to initiate a connection. Workaround for Bulk API problem
          # https://github.com/yatish27/salesforce_bulk_api/issues/33
          client.query('SELECT Id FROM Contact LIMIT 1')
          client
        end
      end

      # Memoized bulk API wrapper; throttles status polling to every 5s.
      def sf_bulk
        @sf_bulk ||= SalesforceBulkApi::Api.new(restforce_client).tap { |o| o.connection.set_status_throttle(5) }
      end

      # Converts the bulk query batches into a Daru::DataFrame. Values come
      # back from the bulk API as single-element arrays; a Hash element with
      # "xsi:nil" == "true" marks a null. Batches are nil'ed out as they are
      # consumed to free memory.
      def to_dataframe
        @logger.info "Converting salesforce query results to a dataframe"

        hash_array = {}
        raw_result['batches'].each do |batch|
          next unless batch['response']

          batch['response'].each do |record|
            record.each do |field, value|
              # Skip bulk-API bookkeeping fields, keep only real columns.
              next if ['xsi:type','type'].include? field
              (hash_array[field.to_sym] ||= []) << case value.first
              when Hash
                value.first["xsi:nil"] == "true" ? nil : value.first
              else
                value.first
              end
            end
          end

          # delete raw result at end of processing to free memory
          batch['response'] = nil
        end

        Daru::DataFrame.new hash_array, order: hash_array.keys
      end

      # Memoized dataframe of the query results.
      def df
        @dataframe ||= to_dataframe
      end
    end
  end
end
@@ -0,0 +1,25 @@
module Remi
  # Shared behavior for anything that carries a dataframe (sources and
  # targets alike): a lazily-created df, a df writer, and field metadata.
  module DataSubject
    # Default converter used to normalize field names into symbols.
    def field_symbolizer
      Remi::FieldSymbolizers[:standard]
    end

    # Backing dataframe; created empty on first access if never assigned.
    def df
      @dataframe ||= Daru::DataFrame.new([])
    end

    # Replaces the backing dataframe wholesale.
    def df=(new_dataframe)
      @dataframe = new_dataframe
    end

    # Fields is a hash where the keys are the data field names and the values
    # are a hash of metadata. DataFrames do not currently support metadata,
    # so the metadata will be empty unless overridden by the specific target.
    def fields
      df.vectors.to_a.each_with_object({}) { |name, meta| meta[name] = {} }
    end
  end
end
@@ -0,0 +1,15 @@
module Remi
  # Common behavior shared by all data targets. Including classes are
  # expected to override #load with something that actually persists data,
  # and to set @logger (e.g. in their initializer).
  module DataTarget
    include DataSubject

    # Gets called automatically at the end of a job, but could
    # also get manually called at the end of a transform so make
    # sure it doesn't do it twice.
    #
    # NOTE(review): @loaded is set to true *before* the raise, so a second
    # call on a target whose first call raised here returns true — confirm
    # this is intended rather than setting @loaded only on success.
    def load
      @logger.info "Loading target"
      return true if @loaded
      @loaded = true
      raise "Load function undefined for #{self.class.name}"
    end
  end
end
@@ -0,0 +1,49 @@
module Remi
  module DataTarget
    # Data target that pushes the dataframe to Salesforce via the Bulk API
    # helpers, performing either an update or a create operation.
    class Salesforce
      include DataTarget

      # @param object [Symbol, String] Salesforce object to write (e.g. :Contact)
      # @param operation [Symbol] :update or :create
      # @param credentials [Hash] Restforce connection credentials
      # @param api [Symbol] stored but not otherwise read here
      # @param logger [Logger] destination for progress messages
      def initialize(object:, operation:, credentials:, api: :bulk, logger: Remi::Settings.logger)
        @sfo = object
        @operation = operation
        @credentials = credentials
        @api = api
        @logger = logger
      end

      # Salesforce field names get their own symbolizer.
      def field_symbolizer
        Remi::FieldSymbolizers[:salesforce]
      end

      # Pushes the dataframe to Salesforce. No-op (returns true) when the
      # target was already loaded or the dataframe is empty.
      def load
        return true if @loaded || df.size == 0

        @logger.info "Performing Salesforce #{@operation} on object #{@sfo}"

        case @operation
        when :update
          Remi::SfBulkHelper::SfBulkUpdate.update(restforce_client, @sfo, df_as_array_of_hashes, logger: @logger)
        when :create
          Remi::SfBulkHelper::SfBulkCreate.create(restforce_client, @sfo, df_as_array_of_hashes, logger: @logger)
        end

        @loaded = true
      end

      # Memoized Restforce client.
      def restforce_client
        @restforce_client ||= begin
          connection = Restforce.new(@credentials)

          # Run a dummy query to initiate a connection. Workaround for Bulk API problem
          # https://github.com/yatish27/salesforce_bulk_api/issues/33
          connection.query('SELECT Id FROM Contact LIMIT 1')
          connection
        end
      end

      # Dataframe rows as an array of hashes, as the bulk helpers expect.
      def df_as_array_of_hashes
        df.to_a[0]
      end

    end
  end
end
@@ -0,0 +1,14 @@
module Remi
  module DataTarget
    # In-memory target: the data already lives in the dataframe, so there
    # is nothing external to persist.
    class DataFrame
      include DataSubject

      # Accepts arbitrary keyword options for interface parity with the
      # other targets; none are used.
      def initialize(**args)
      end

      # Loading an in-memory dataframe is always trivially successful.
      def load
        true
      end
    end
  end
end
@@ -0,0 +1,49 @@
# NOTE(review): this diff hunk is byte-identical to the Salesforce target
# hunk shown earlier on this page, yet the file list says this slot should
# be data/lib/remi/data_target/csv_file.rb (+49) — likely a rendering
# glitch on the diff page; verify against the actual gem contents.
module Remi
  module DataTarget
    # Data target that pushes the dataframe to Salesforce via the Bulk API
    # helpers, performing either an update or a create operation.
    class Salesforce
      include DataTarget

      # @param object [Symbol, String] Salesforce object to write (e.g. :Contact)
      # @param operation [Symbol] :update or :create
      # @param credentials [Hash] Restforce connection credentials
      # @param api [Symbol] stored but not otherwise read here
      # @param logger [Logger] destination for progress messages
      def initialize(object:, operation:, credentials:, api: :bulk, logger: Remi::Settings.logger)
        @sfo = object
        @operation = operation
        @credentials = credentials
        @api = api
        @logger = logger
      end

      # Salesforce field names get their own symbolizer.
      def field_symbolizer
        Remi::FieldSymbolizers[:salesforce]
      end

      # Pushes the dataframe to Salesforce. No-op (returns true) when the
      # target was already loaded or the dataframe is empty.
      def load
        return true if @loaded || df.size == 0

        @logger.info "Performing Salesforce #{@operation} on object #{@sfo}"

        if @operation == :update
          Remi::SfBulkHelper::SfBulkUpdate.update(restforce_client, @sfo, df_as_array_of_hashes, logger: @logger)
        elsif @operation == :create
          Remi::SfBulkHelper::SfBulkCreate.create(restforce_client, @sfo, df_as_array_of_hashes, logger: @logger)
        end

        @loaded = true
      end

      # Memoized Restforce client.
      def restforce_client
        @restforce_client ||= begin
          client = Restforce.new(@credentials)

          #run a dummy query to initiate a connection. Workaround for Bulk API problem
          # https://github.com/yatish27/salesforce_bulk_api/issues/33
          client.query('SELECT Id FROM Contact LIMIT 1')
          client
        end
      end

      # Dataframe rows as an array of hashes, as the bulk helpers expect.
      def df_as_array_of_hashes
        df.to_a[0]
      end

    end
  end
end