remi 0.2.42 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. checksums.yaml +4 -4
  2. data/.yardopts +7 -0
  3. data/Gemfile +1 -1
  4. data/Gemfile.lock +13 -26
  5. data/README.md +1 -1
  6. data/features/step_definitions/remi_step.rb +33 -13
  7. data/features/sub_job_example.feature +24 -0
  8. data/features/sub_transform_example.feature +35 -0
  9. data/features/sub_transform_many_to_many.feature +49 -0
  10. data/features/support/env_app.rb +1 -1
  11. data/jobs/all_jobs_shared.rb +19 -16
  12. data/jobs/copy_source_job.rb +11 -9
  13. data/jobs/csv_file_target_job.rb +10 -9
  14. data/jobs/json_job.rb +18 -14
  15. data/jobs/metadata_job.rb +33 -28
  16. data/jobs/parameters_job.rb +14 -11
  17. data/jobs/sample_job.rb +106 -77
  18. data/jobs/sftp_file_target_job.rb +14 -13
  19. data/jobs/sub_job_example_job.rb +86 -0
  20. data/jobs/sub_transform_example_job.rb +43 -0
  21. data/jobs/sub_transform_many_to_many_job.rb +46 -0
  22. data/jobs/transforms/concatenate_job.rb +16 -12
  23. data/jobs/transforms/data_frame_sieve_job.rb +24 -19
  24. data/jobs/transforms/date_diff_job.rb +15 -11
  25. data/jobs/transforms/nvl_job.rb +16 -12
  26. data/jobs/transforms/parse_date_job.rb +17 -14
  27. data/jobs/transforms/partitioner_job.rb +27 -19
  28. data/jobs/transforms/prefix_job.rb +13 -10
  29. data/jobs/transforms/truncate_job.rb +14 -10
  30. data/jobs/transforms/truthy_job.rb +11 -8
  31. data/lib/remi.rb +25 -11
  32. data/lib/remi/data_frame.rb +4 -4
  33. data/lib/remi/data_frame/daru.rb +1 -37
  34. data/lib/remi/data_subject.rb +234 -48
  35. data/lib/remi/data_subjects/csv_file.rb +171 -0
  36. data/lib/remi/data_subjects/data_frame.rb +106 -0
  37. data/lib/remi/data_subjects/file_system.rb +115 -0
  38. data/lib/remi/data_subjects/local_file.rb +109 -0
  39. data/lib/remi/data_subjects/none.rb +31 -0
  40. data/lib/remi/data_subjects/postgres.rb +186 -0
  41. data/lib/remi/data_subjects/s3_file.rb +84 -0
  42. data/lib/remi/data_subjects/salesforce.rb +211 -0
  43. data/lib/remi/data_subjects/sftp_file.rb +196 -0
  44. data/lib/remi/data_subjects/sub_job.rb +50 -0
  45. data/lib/remi/dsl.rb +74 -0
  46. data/lib/remi/encoder.rb +45 -0
  47. data/lib/remi/extractor.rb +21 -0
  48. data/lib/remi/field_symbolizers.rb +1 -0
  49. data/lib/remi/job.rb +279 -113
  50. data/lib/remi/job/parameters.rb +90 -0
  51. data/lib/remi/job/sub_job.rb +35 -0
  52. data/lib/remi/job/transform.rb +165 -0
  53. data/lib/remi/loader.rb +22 -0
  54. data/lib/remi/monkeys/daru.rb +4 -0
  55. data/lib/remi/parser.rb +44 -0
  56. data/lib/remi/testing/business_rules.rb +17 -23
  57. data/lib/remi/testing/data_stub.rb +2 -2
  58. data/lib/remi/version.rb +1 -1
  59. data/remi.gemspec +3 -0
  60. data/spec/data_subject_spec.rb +475 -11
  61. data/spec/data_subjects/csv_file_spec.rb +69 -0
  62. data/spec/data_subjects/data_frame_spec.rb +52 -0
  63. data/spec/{extractor → data_subjects}/file_system_spec.rb +0 -0
  64. data/spec/{extractor → data_subjects}/local_file_spec.rb +0 -0
  65. data/spec/data_subjects/none_spec.rb +41 -0
  66. data/spec/data_subjects/postgres_spec.rb +80 -0
  67. data/spec/{extractor → data_subjects}/s3_file_spec.rb +0 -0
  68. data/spec/data_subjects/salesforce_spec.rb +117 -0
  69. data/spec/{extractor → data_subjects}/sftp_file_spec.rb +16 -0
  70. data/spec/data_subjects/sub_job_spec.rb +33 -0
  71. data/spec/encoder_spec.rb +38 -0
  72. data/spec/extractor_spec.rb +11 -0
  73. data/spec/fixtures/sf_bulk_helper_stubs.rb +443 -0
  74. data/spec/job/transform_spec.rb +257 -0
  75. data/spec/job_spec.rb +507 -0
  76. data/spec/loader_spec.rb +11 -0
  77. data/spec/parser_spec.rb +38 -0
  78. data/spec/sf_bulk_helper_spec.rb +117 -0
  79. data/spec/testing/data_stub_spec.rb +5 -3
  80. metadata +109 -27
  81. data/features/aggregate.feature +0 -42
  82. data/jobs/aggregate_job.rb +0 -31
  83. data/jobs/transforms/transform_jobs.rb +0 -4
  84. data/lib/remi/data_subject/csv_file.rb +0 -162
  85. data/lib/remi/data_subject/data_frame.rb +0 -52
  86. data/lib/remi/data_subject/postgres.rb +0 -134
  87. data/lib/remi/data_subject/salesforce.rb +0 -136
  88. data/lib/remi/data_subject/sftp_file.rb +0 -65
  89. data/lib/remi/extractor/file_system.rb +0 -92
  90. data/lib/remi/extractor/local_file.rb +0 -43
  91. data/lib/remi/extractor/s3_file.rb +0 -57
  92. data/lib/remi/extractor/sftp_file.rb +0 -83
  93. data/spec/data_subject/csv_file_spec.rb +0 -79
  94. data/spec/data_subject/data_frame.rb +0 -27
@@ -1,65 +0,0 @@
module Remi

  # Data target that uploads a single local file to a remote SFTP server.
  #
  # NOTE(review): @logger is not assigned in this class; presumably it is
  # provided by DataTarget - confirm against the superclass.
  class DataTarget::SftpFile < DataTarget

    # Forwards every argument to DataTarget#initialize and then captures the
    # SFTP-specific options (see #init_sftp_file for the accepted keywords).
    def initialize(*args, **kargs, &block)
      super
      init_sftp_file(*args, **kargs, &block)
    end

    attr_reader :local_path
    attr_reader :remote_path

    # Public: Performs the load operation, regardless of whether it has
    # already executed.
    #
    # Returns true if the load operation was successful
    def load!
      @logger.info "Uploading #{@local_path} to #{@credentials[:username]}@#{@credentials[:host]}: #{@remote_path}"
      connection do |sftp|
        retry_upload { sftp.upload! @local_path, @remote_path }
      end

      true
    end


    private

    # Stores the SFTP options and builds the parameter data frame.
    #
    # credentials - Hash read with the keys :host, :username, :password and
    #               (optionally) :port.
    # local_path  - Path of the file to upload.
    # remote_path - Destination path on the server (default: the basename of
    #               local_path).
    def init_sftp_file(*args, credentials:, local_path:, remote_path: File.basename(local_path), **kargs, &block)
      @credentials = credentials
      @local_path = local_path
      @remote_path = remote_path
      init_df
    end

    # Exposes the local and remote paths as this target's data frame.
    def init_df
      parameter_df = Daru::DataFrame.new(
        local_path: Array(@local_path),
        remote_path: Array(@remote_path)
      )
      self.df = parameter_df
    end

    # Opens an SFTP session, yields it to the block and returns the block's
    # result. The port falls back to '22' when the credentials give none.
    def connection(&block)
      result = nil
      Net::SFTP.start(@credentials[:host], @credentials[:username], password: @credentials[:password], port: @credentials[:port] || '22') do |sftp|
        result = yield sftp
      end
      result
    end

    # Runs the block, retrying after a RuntimeError until ntry attempts have
    # been made in total; the error from the final attempt is re-raised.
    #
    # ntry - Maximum number of attempts (default: 2).
    def retry_upload(ntry=2, &block)
      1.upto(ntry).each do |itry|
        begin
          block.call
          # Stop after the first successful attempt; without this the upload
          # would be repeated on every remaining iteration.
          break
        rescue RuntimeError => err
          raise err unless itry < ntry
          @logger.error "Upload failed with error: #{err.message}"
          @logger.error "Retry attempt #{itry}/#{ntry-1}"
          sleep(1)
        end
      end
    end
  end
end
@@ -1,92 +0,0 @@
module Remi
  module Extractor

    # Describes one file discovered on a filesystem: where it lives and when
    # it was created and last modified.
    class FileSystemEntry
      # pathname      - String or Pathname locating the file (wrapped in a Pathname).
      # create_time   - Creation timestamp of the file.
      # modified_time - Last-modification timestamp of the file.
      # raw           - Optional backend-specific object for the entry (default: nil).
      def initialize(pathname:, create_time:, modified_time:, raw: nil)
        @pathname = Pathname.new(pathname)
        @create_time = create_time
        @modified_time = modified_time
        @raw = raw
      end

      attr_reader :pathname, :create_time, :modified_time, :raw

      # Public: Returns the last path component of the entry as a String.
      def name
        @pathname.basename.to_s
      end
    end


    # Base class for extractors that pick files out of a filesystem.
    # Subclasses must implement #extract and #all_entries; the selection
    # logic (#entries and friends) is shared here.
    class FileSystem
      class FileNotFoundError < StandardError; end

      # remote_path      - Path to search (wrapped in a Pathname).
      # pattern          - Regexp an entry's name must match (default: /.*/).
      # local_path       - Directory for extracted files (default: Settings.work_dir).
      # most_recent_only - When true, #entries returns only the newest match.
      # group_by         - Regexp whose captures define a group; when given,
      #                    #entries returns the newest match of each group.
      # most_recent_by   - Entry attribute used to rank recency (default: :create_time).
      # logger           - Logger (default: Remi::Settings.logger).
      #
      # Positional arguments, extra keywords and the block are accepted and ignored here.
      def initialize(*args, remote_path:, pattern: /.*/, local_path: Settings.work_dir, most_recent_only: false, group_by: nil, most_recent_by: :create_time, logger: Remi::Settings.logger, **kargs, &block)
        @remote_path = Pathname.new(remote_path)
        @pattern = pattern
        @local_path = Pathname.new(local_path)
        @most_recent_only = most_recent_only
        @group_by = group_by
        @most_recent_by = most_recent_by
        @logger = logger
      end

      attr_reader :remote_path
      attr_reader :pattern
      attr_reader :local_path
      attr_reader :most_recent_only
      attr_reader :group_by
      attr_reader :most_recent_by
      attr_reader :logger

      # Public: Called to extract files from the source filesystem.
      #
      # Returns an array containing the paths to all files extracted.
      # Raises NoMethodError unless overridden by a subclass.
      def extract
        raise NoMethodError, "#{__method__} not defined for #{self.class.name}"
      end

      # Public: Returns an array of all FileSystemEntry instances that are in the remote_path.
      # NOTE: all_entries is responsible for matching the path using @remote_path
      # Raises NoMethodError unless overridden by a subclass.
      def all_entries
        raise NoMethodError, "#{__method__} not defined for #{self.class.name}"
      end

      # Public: Returns just the entries that are to be extracted.
      # group_by takes precedence over most_recent_only.
      def entries
        if @group_by
          most_recent_matching_entry_in_group
        elsif @most_recent_only
          # Array(nil) is [], so no match yields an empty list.
          Array(most_recent_matching_entry)
        else
          matching_entries
        end
      end

      # Public: Returns all entries whose name matches the pattern.
      def matching_entries
        all_entries.select { |e| @pattern.match e.name }
      end

      # Public: Returns the newest matching entry (ranked by most_recent_by),
      # or nil when nothing matches.
      def most_recent_matching_entry
        matching_entries.max_by { |e| e.send(@most_recent_by) }
      end

      # Public: Returns the newest matching entry of every group, where a
      # group is the list of captures produced by the group_by regexp.
      # Entries whose name does not match group_by are dropped. Groups are
      # returned in descending order of their captures.
      def most_recent_matching_entry_in_group
        groups = matching_entries.each_with_object({}) do |entry, acc|
          match = entry.name.match(@group_by)
          next unless match

          (acc[match.captures] ||= []) << entry
        end

        # Captures of optional groups may be nil, which cannot be compared
        # with strings; order on the stringified captures instead.
        ordered = groups.sort_by { |captures, _members| captures.map(&:to_s) }.reverse
        ordered.map { |_captures, members| members.max_by { |e| e.send(@most_recent_by) } }
      end
    end
  end
end
@@ -1,43 +0,0 @@
module Remi
  module Extractor

    # Extractor for files that already live on the local filesystem; nothing
    # is copied, the matching paths are simply reported.
    class LocalFile < FileSystem
      # Passes all arguments through to FileSystem#initialize.
      def initialize(*args, **kargs)
        super
        init_local_file(*args, **kargs)
      end

      # Public: Called to extract files from the source filesystem.
      #
      # Returns an array with the Pathname of every selected entry.
      def extract
        entries.map { |entry| entry.pathname }
      end

      # Public: Returns an array of all FileSystemEntry instances that are in
      # the remote_path. The listing is computed once and then reused.
      def all_entries
        @all_entries ||= all_entries!
      end

      # Public: Lists the remote_path afresh. A directory is expanded to its
      # direct children; anything else is used as the glob itself. Only
      # regular files are returned.
      def all_entries!
        glob = if @remote_path.directory?
                 @remote_path + '*'
               else
                 @remote_path
               end

        Dir[glob].each_with_object([]) do |found, listing|
          candidate = Pathname.new(found)
          next unless candidate.file?

          listing << FileSystemEntry.new(
            pathname: candidate.realpath.to_s,
            create_time: candidate.ctime,
            modified_time: candidate.mtime
          )
        end
      end

      private

      # Hook for local-file specific options; intentionally does nothing.
      def init_local_file(*args, **kargs)
      end

    end
  end
end
@@ -1,57 +0,0 @@
module Remi
  module Extractor

    # Extractor that downloads matching objects from an S3 bucket into the
    # local path.
    class S3File < FileSystem

      # Passes all arguments through to FileSystem#initialize and then
      # records the bucket name (required keyword: bucket).
      def initialize(*args, **kargs, &block)
        super
        init_s3_file(*args, **kargs, &block)
      end

      # Public: Called to extract files from the source filesystem.
      #
      # Returns an array with the local paths of all files downloaded.
      def extract
        entries.map do |s3_entry|
          destination = File.join(@local_path, s3_entry.name)
          @logger.info "Downloading #{s3_entry.pathname} from S3 to #{destination}"
          File.open(destination, 'wb') do |io|
            s3_entry.raw.get(response_target: io)
          end
          destination
        end
      end

      # Public: Returns an array of all FileSystemEntry instances that are in
      # the remote_path. The listing is computed once and then reused.
      def all_entries
        @all_entries ||= all_entries!
      end

      # Public: Lists the bucket afresh, using remote_path as the key prefix.
      def all_entries!
        s3_objects = bucket.objects(prefix: @remote_path.to_s)

        # S3 does not track anything like a create time, so use last modified for both
        s3_objects.map do |s3_object|
          modified_at = s3_object.last_modified
          FileSystemEntry.new(
            pathname: s3_object.key,
            create_time: modified_at,
            modified_time: modified_at,
            raw: s3_object
          )
        end
      end

      # Public: Returns the memoized S3 client, built with default settings.
      def s3_client
        @s3_client ||= Aws::S3::Client.new
      end

      private

      # Remembers the name of the bucket to read from.
      def init_s3_file(*args, bucket:, **kargs)
        @bucket_name = bucket
      end

      # Returns the memoized bucket resource for the configured bucket name.
      def bucket
        @bucket ||= Aws::S3::Bucket.new(@bucket_name, client: s3_client)
      end

    end

  end
end
@@ -1,83 +0,0 @@
module Remi
  module Extractor

    # Extractor that downloads matching files from an SFTP server into the
    # local path.
    class SftpFile < FileSystem

      # Total number of download attempts made per file.
      N_RETRY = 3

      # Passes all arguments through to FileSystem#initialize and then reads
      # the connection settings from the required credentials keyword.
      def initialize(*args, **kargs)
        super
        init_sftp_file(*args, **kargs)
      end

      attr_reader :host
      attr_reader :username
      attr_reader :password
      attr_reader :port

      # Public: Called to extract files from the source filesystem.
      #
      # Returns an array with the local paths of all files downloaded.
      def extract
        connection do |sftp|
          entries.map do |remote_entry|
            destination = File.join(@local_path, remote_entry.name)
            source = File.join(@remote_path, remote_entry.name)
            @logger.info "Downloading #{remote_entry.name} to #{destination}"
            retry_download { sftp.download!(source, destination) }
            destination
          end
        end
      end

      # Public: Returns an array of all FileSystemEntry instances that are in
      # the remote_path. The listing is computed once and then reused.
      def all_entries
        @all_entries ||= all_entries!
      end

      # Public: Lists the remote_path afresh over a new connection.
      def all_entries!
        listing = connection { |sftp| sftp.dir.entries(@remote_path) }

        listing.map do |item|
          attrs = item.attributes
          # Early versions of the protocol don't support create time, fake it with modified time?
          created_at = attrs.respond_to?(:createtime) ? attrs.createtime : attrs.mtime

          FileSystemEntry.new(
            pathname: File.join(@remote_path, item.name),
            create_time: created_at,
            modified_time: attrs.mtime
          )
        end
      end


      private

      # Reads :host, :username and :password (all required) and :port
      # (default '22') from the credentials hash.
      def init_sftp_file(*args, credentials:, **kargs)
        @host, @username, @password = %i[host username password].map { |key| credentials.fetch(key) }
        @port = credentials.fetch(:port, '22')
      end

      # Opens an SFTP session, yields it to the block and returns the block's
      # result.
      def connection(&block)
        outcome = nil
        Net::SFTP.start(@host, @username, password: @password, port: @port) do |session|
          outcome = yield session
        end
        outcome
      end

      # Runs the block, retrying after a RuntimeError until N_RETRY attempts
      # have been made in total; the final error is re-raised. Returns nil.
      def retry_download(&block)
        attempt = 0
        begin
          attempt += 1
          block.call
        rescue RuntimeError => err
          raise err unless attempt < N_RETRY
          @logger.error "Download failed with error: #{err.message}"
          @logger.error "Retry attempt #{attempt}/#{N_RETRY-1}"
          sleep(1)
          retry
        end
        nil
      end
    end

  end
end
@@ -1,79 +0,0 @@
require_relative '../remi_spec'

# Specs for reading CSV files into a data frame through DataSource::CsvFile.
describe DataSource::CsvFile do

  # Fixture used by most examples.
  let(:basic_csv) { 'spec/fixtures/basic.csv' }

  # Rows expected from the basic fixture.
  let(:basic_rows) do
    Remi::DataFrame::Daru.new(
      {
        column_a: ['value 1A', 'value 2A'],
        column_b: ['value 1B', 'value 2B']
      }
    ).to_a
  end

  it "converts a CSV into a dataframe" do
    source = Remi::DataSource::CsvFile.new(extractor: basic_csv)

    expect(source.df.to_a).to eq basic_rows
  end

  it "adds filename when requested" do
    source = Remi::DataSource::CsvFile.new(
      extractor: basic_csv,
      filename_field: :from_file
    )

    # One copy of the resolved fixture path per row of the fixture.
    resolved = Pathname.new('spec/fixtures/basic.csv').realpath.to_s
    expect(source.df[:from_file].to_a).to eq [resolved, resolved]
  end

  it "preprocesses records when required" do
    # The fixture escapes quotes with a backslash; rewrite them as doubled quotes.
    fix_escapes = ->(line) { line.gsub(/\\"/,'""') }
    source = Remi::DataSource::CsvFile.new(
      extractor: 'spec/fixtures/unsupported_escape.csv',
      preprocessor: fix_escapes
    )

    expected_rows = Remi::DataFrame::Daru.new(
      {
        column_a: ['value 1A', 'value 2A'],
        column_b: ['value "1B"', 'value "2B"']
      }
    ).to_a
    expect(source.df.to_a).to eq expected_rows
  end

  it "accepts standard Ruby CSV options" do
    # Turn the commas into pipes so the col_sep option is what makes parsing work.
    to_pipes = ->(line) { line.gsub(/,/,'|') }
    source = Remi::DataSource::CsvFile.new(
      extractor: basic_csv,
      preprocessor: to_pipes,
      csv_options: { col_sep: '|' }
    )

    expect(source.df.to_a).to eq basic_rows
  end

  it "combines multiple csv files into a single dataframe" do
    fixture_files = Remi::Extractor::LocalFile.new(
      remote_path: 'spec/fixtures',
      pattern: /basic(|2)\.csv/
    )
    source = Remi::DataSource::CsvFile.new(extractor: fixture_files)

    expected_rows = Remi::DataFrame::Daru.new(
      {
        column_a: ['value 1A', 'value 2A', 'value 1A', 'value 2A'],
        column_b: ['value 1B', 'value 2B', nil, nil],
        column_c: [nil, nil, 'value 1C', 'value 2C']
      }
    ).to_a

    expect(source.df.to_a).to eq expected_rows
  end

end
@@ -1,27 +0,0 @@
require_relative '../remi_spec'

# Spec for building a data frame from in-line rows through DataSource::DataFrame.
describe DataSource::DataFrame do
  # Source built from two fields and three rows of string values.
  let(:source_dataframe) do
    Remi::DataSource::DataFrame.new(
      fields: { :col1 => {}, :col2 => {} },
      data: [
        ['11', '12'],
        ['21', '22'],
        ['31', '32']
      ]
    )
  end

  # The same data laid out column by column.
  let(:expected_df) do
    Remi::DataFrame::Daru.new(
      {
        col1: ['11', '21', '31'],
        col2: ['12', '22', '32']
      }
    )
  end

  it "converts data into a dataframe" do
    result = source_dataframe.df

    expect(result).to be_a Remi::DataFrame
    expect(result.to_a).to eq expected_df.to_a
  end
end
- end