remi 0.2.42 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (94) hide show
  1. checksums.yaml +4 -4
  2. data/.yardopts +7 -0
  3. data/Gemfile +1 -1
  4. data/Gemfile.lock +13 -26
  5. data/README.md +1 -1
  6. data/features/step_definitions/remi_step.rb +33 -13
  7. data/features/sub_job_example.feature +24 -0
  8. data/features/sub_transform_example.feature +35 -0
  9. data/features/sub_transform_many_to_many.feature +49 -0
  10. data/features/support/env_app.rb +1 -1
  11. data/jobs/all_jobs_shared.rb +19 -16
  12. data/jobs/copy_source_job.rb +11 -9
  13. data/jobs/csv_file_target_job.rb +10 -9
  14. data/jobs/json_job.rb +18 -14
  15. data/jobs/metadata_job.rb +33 -28
  16. data/jobs/parameters_job.rb +14 -11
  17. data/jobs/sample_job.rb +106 -77
  18. data/jobs/sftp_file_target_job.rb +14 -13
  19. data/jobs/sub_job_example_job.rb +86 -0
  20. data/jobs/sub_transform_example_job.rb +43 -0
  21. data/jobs/sub_transform_many_to_many_job.rb +46 -0
  22. data/jobs/transforms/concatenate_job.rb +16 -12
  23. data/jobs/transforms/data_frame_sieve_job.rb +24 -19
  24. data/jobs/transforms/date_diff_job.rb +15 -11
  25. data/jobs/transforms/nvl_job.rb +16 -12
  26. data/jobs/transforms/parse_date_job.rb +17 -14
  27. data/jobs/transforms/partitioner_job.rb +27 -19
  28. data/jobs/transforms/prefix_job.rb +13 -10
  29. data/jobs/transforms/truncate_job.rb +14 -10
  30. data/jobs/transforms/truthy_job.rb +11 -8
  31. data/lib/remi.rb +25 -11
  32. data/lib/remi/data_frame.rb +4 -4
  33. data/lib/remi/data_frame/daru.rb +1 -37
  34. data/lib/remi/data_subject.rb +234 -48
  35. data/lib/remi/data_subjects/csv_file.rb +171 -0
  36. data/lib/remi/data_subjects/data_frame.rb +106 -0
  37. data/lib/remi/data_subjects/file_system.rb +115 -0
  38. data/lib/remi/data_subjects/local_file.rb +109 -0
  39. data/lib/remi/data_subjects/none.rb +31 -0
  40. data/lib/remi/data_subjects/postgres.rb +186 -0
  41. data/lib/remi/data_subjects/s3_file.rb +84 -0
  42. data/lib/remi/data_subjects/salesforce.rb +211 -0
  43. data/lib/remi/data_subjects/sftp_file.rb +196 -0
  44. data/lib/remi/data_subjects/sub_job.rb +50 -0
  45. data/lib/remi/dsl.rb +74 -0
  46. data/lib/remi/encoder.rb +45 -0
  47. data/lib/remi/extractor.rb +21 -0
  48. data/lib/remi/field_symbolizers.rb +1 -0
  49. data/lib/remi/job.rb +279 -113
  50. data/lib/remi/job/parameters.rb +90 -0
  51. data/lib/remi/job/sub_job.rb +35 -0
  52. data/lib/remi/job/transform.rb +165 -0
  53. data/lib/remi/loader.rb +22 -0
  54. data/lib/remi/monkeys/daru.rb +4 -0
  55. data/lib/remi/parser.rb +44 -0
  56. data/lib/remi/testing/business_rules.rb +17 -23
  57. data/lib/remi/testing/data_stub.rb +2 -2
  58. data/lib/remi/version.rb +1 -1
  59. data/remi.gemspec +3 -0
  60. data/spec/data_subject_spec.rb +475 -11
  61. data/spec/data_subjects/csv_file_spec.rb +69 -0
  62. data/spec/data_subjects/data_frame_spec.rb +52 -0
  63. data/spec/{extractor → data_subjects}/file_system_spec.rb +0 -0
  64. data/spec/{extractor → data_subjects}/local_file_spec.rb +0 -0
  65. data/spec/data_subjects/none_spec.rb +41 -0
  66. data/spec/data_subjects/postgres_spec.rb +80 -0
  67. data/spec/{extractor → data_subjects}/s3_file_spec.rb +0 -0
  68. data/spec/data_subjects/salesforce_spec.rb +117 -0
  69. data/spec/{extractor → data_subjects}/sftp_file_spec.rb +16 -0
  70. data/spec/data_subjects/sub_job_spec.rb +33 -0
  71. data/spec/encoder_spec.rb +38 -0
  72. data/spec/extractor_spec.rb +11 -0
  73. data/spec/fixtures/sf_bulk_helper_stubs.rb +443 -0
  74. data/spec/job/transform_spec.rb +257 -0
  75. data/spec/job_spec.rb +507 -0
  76. data/spec/loader_spec.rb +11 -0
  77. data/spec/parser_spec.rb +38 -0
  78. data/spec/sf_bulk_helper_spec.rb +117 -0
  79. data/spec/testing/data_stub_spec.rb +5 -3
  80. metadata +109 -27
  81. data/features/aggregate.feature +0 -42
  82. data/jobs/aggregate_job.rb +0 -31
  83. data/jobs/transforms/transform_jobs.rb +0 -4
  84. data/lib/remi/data_subject/csv_file.rb +0 -162
  85. data/lib/remi/data_subject/data_frame.rb +0 -52
  86. data/lib/remi/data_subject/postgres.rb +0 -134
  87. data/lib/remi/data_subject/salesforce.rb +0 -136
  88. data/lib/remi/data_subject/sftp_file.rb +0 -65
  89. data/lib/remi/extractor/file_system.rb +0 -92
  90. data/lib/remi/extractor/local_file.rb +0 -43
  91. data/lib/remi/extractor/s3_file.rb +0 -57
  92. data/lib/remi/extractor/sftp_file.rb +0 -83
  93. data/spec/data_subject/csv_file_spec.rb +0 -79
  94. data/spec/data_subject/data_frame.rb +0 -27
@@ -1,65 +0,0 @@
1
- module Remi
2
-
3
- class DataTarget::SftpFile < DataTarget
4
-
5
- def initialize(*args, **kargs, &block)
6
- super
7
- init_sftp_file(*args, **kargs, &block)
8
- end
9
-
10
- attr_reader :local_path
11
- attr_reader :remote_path
12
-
13
- # Public: Performs the load operation, regardless of whether it has
14
- # already executed.
15
- #
16
- # Returns true if the load operation was successful
17
- def load!
18
- @logger.info "Uploading #{@local_path} to #{@credentials[:username]}@#{@credentials[:host]}: #{@remote_path}"
19
- connection do |sftp|
20
- retry_upload { sftp.upload! @local_path, @remote_path }
21
- end
22
-
23
- true
24
- end
25
-
26
-
27
- private
28
-
29
- def init_sftp_file(*args, credentials:, local_path:, remote_path: File.basename(local_path), **kargs, &block)
30
- @credentials = credentials
31
- @local_path = local_path
32
- @remote_path = remote_path
33
- init_df
34
- end
35
-
36
- def init_df
37
- parameter_df = Daru::DataFrame.new(
38
- local_path: Array(@local_path),
39
- remote_path: Array(@remote_path)
40
- )
41
- self.df = parameter_df
42
- end
43
-
44
- def connection(&block)
45
- result = nil
46
- Net::SFTP.start(@credentials[:host], @credentials[:username], password: @credentials[:password], port: @credentials[:port] || '22') do |sftp|
47
- result = yield sftp
48
- end
49
- result
50
- end
51
-
52
- def retry_upload(ntry=2, &block)
53
- 1.upto(ntry).each do |itry|
54
- begin
55
- block.call
56
- rescue RuntimeError => err
57
- raise err unless itry < ntry
58
- @logger.error "Upload failed with error: #{err.message}"
59
- @logger.error "Retry attempt #{itry}/#{ntry-1}"
60
- sleep(1)
61
- end
62
- end
63
- end
64
- end
65
- end
@@ -1,92 +0,0 @@
1
- module Remi
2
- module Extractor
3
-
4
- class FileSystemEntry
5
- def initialize(pathname:, create_time:, modified_time:, raw: nil)
6
- @pathname = Pathname.new(pathname)
7
- @create_time = create_time
8
- @modified_time = modified_time
9
- @raw = raw
10
- end
11
-
12
- attr_reader :pathname, :create_time, :modified_time, :raw
13
-
14
- def name
15
- @pathname.basename.to_s
16
- end
17
- end
18
-
19
-
20
- class FileSystem
21
- class FileNotFoundError < StandardError; end
22
-
23
- def initialize(*args, remote_path:, pattern: /.*/, local_path: Settings.work_dir, most_recent_only: false, group_by: nil, most_recent_by: :create_time, logger: Remi::Settings.logger, **kargs, &block)
24
- @remote_path = Pathname.new(remote_path)
25
- @pattern = pattern
26
- @local_path = Pathname.new(local_path)
27
- @most_recent_only = most_recent_only
28
- @group_by = group_by
29
- @most_recent_by = most_recent_by
30
- @logger = logger
31
- end
32
-
33
- attr_reader :remote_path
34
- attr_reader :pattern
35
- attr_reader :local_path
36
- attr_reader :most_recent_only
37
- attr_reader :group_by
38
- attr_reader :most_recent_by
39
- attr_reader :logger
40
-
41
- # Public: Called to extract files from the source filesystem.
42
- #
43
- # Returns an array containing the paths to all files extracted.
44
- def extract
45
- raise NoMethodError, "#{__method__} not defined for#{self.class.name}"
46
- end
47
-
48
- # Public: Returns an array of all FileSystemEntry instances that are in the remote_path.
49
- # NOTE: all_entries is responsible for matching the path using @remote_path
50
- def all_entries
51
- raise NoMethodError, "#{__method__} not defined for#{self.class.name}"
52
- end
53
-
54
- # Public: Returns just the entries that are to be extracted.
55
- def entries
56
- if @group_by
57
- most_recent_matching_entry_in_group
58
- elsif @most_recent_only
59
- Array(most_recent_matching_entry)
60
- else
61
- matching_entries
62
- end
63
- end
64
-
65
- def matching_entries
66
- all_entries.select { |e| @pattern.match e.name }
67
- end
68
-
69
- def most_recent_matching_entry
70
- matching_entries.sort_by { |e| e.send(@most_recent_by) }.reverse.first
71
- end
72
-
73
- def most_recent_matching_entry_in_group
74
- entries_with_group = matching_entries.map do |entry|
75
- match = entry.name.match(@group_by)
76
- next unless match
77
-
78
- group = match.to_a[1..-1]
79
- { group: group, entry: entry }
80
- end.compact
81
- sorted_entries_with_group = entries_with_group.sort_by { |e| [e[:group], e[:entry].send(@most_recent_by)] }.reverse
82
-
83
- last_group = nil
84
- sorted_entries_with_group.map do |entry|
85
- next unless entry[:group] != last_group
86
- last_group = entry[:group]
87
- entry[:entry]
88
- end.compact
89
- end
90
- end
91
- end
92
- end
@@ -1,43 +0,0 @@
1
- module Remi
2
- module Extractor
3
-
4
- class LocalFile < FileSystem
5
- def initialize(*args, **kargs)
6
- super
7
- init_local_file(*args, **kargs)
8
- end
9
-
10
- # Public: Called to extract files from the source filesystem.
11
- #
12
- # Returns an array containing the paths to all files extracted.
13
- def extract
14
- entries.map(&:pathname)
15
- end
16
-
17
- # Public: Returns an array of all FileSystemEntry instances that are in the remote_path.
18
- def all_entries
19
- @all_entries ||= all_entries!
20
- end
21
-
22
- def all_entries!
23
- dir = @remote_path.directory? ? @remote_path + '*' : @remote_path
24
- Dir[dir].map do |entry|
25
- path = Pathname.new(entry)
26
- if path.file?
27
- FileSystemEntry.new(
28
- pathname: path.realpath.to_s,
29
- create_time: path.ctime,
30
- modified_time: path.mtime
31
- )
32
- end
33
- end.compact
34
- end
35
-
36
- private
37
-
38
- def init_local_file(*args, **kargs)
39
- end
40
-
41
- end
42
- end
43
- end
@@ -1,57 +0,0 @@
1
- module Remi
2
- module Extractor
3
-
4
- class S3File < FileSystem
5
-
6
- def initialize(*args, **kargs, &block)
7
- super
8
- init_s3_file(*args, **kargs, &block)
9
- end
10
-
11
- # Public: Called to extract files from the source filesystem.
12
- #
13
- # Returns an array containing the paths to all files extracted.
14
- def extract
15
- entries.map do |entry|
16
- local_file = File.join(@local_path, entry.name)
17
- @logger.info "Downloading #{entry.pathname} from S3 to #{local_file}"
18
- File.open(local_file, 'wb') { |file| entry.raw.get(response_target: file) }
19
- local_file
20
- end
21
- end
22
-
23
- # Public: Returns an array of all FileSystemEntry instances that are in the remote_path.
24
- def all_entries
25
- @all_entries ||= all_entries!
26
- end
27
-
28
- def all_entries!
29
- # S3 does not track anything like a create time, so use last modified for both
30
- bucket.objects(prefix: @remote_path.to_s).map do |entry|
31
- FileSystemEntry.new(
32
- pathname: entry.key,
33
- create_time: entry.last_modified,
34
- modified_time: entry.last_modified,
35
- raw: entry
36
- )
37
- end
38
- end
39
-
40
- def s3_client
41
- @s3_client ||= Aws::S3::Client.new
42
- end
43
-
44
- private
45
-
46
- def init_s3_file(*args, bucket:, **kargs)
47
- @bucket_name = bucket
48
- end
49
-
50
- def bucket
51
- @bucket ||= Aws::S3::Bucket.new(@bucket_name, client: s3_client)
52
- end
53
-
54
- end
55
-
56
- end
57
- end
@@ -1,83 +0,0 @@
1
- module Remi
2
- module Extractor
3
-
4
- class SftpFile < FileSystem
5
-
6
- N_RETRY = 3
7
-
8
- def initialize(*args, **kargs)
9
- super
10
- init_sftp_file(*args, **kargs)
11
- end
12
-
13
- attr_reader :host
14
- attr_reader :username
15
- attr_reader :password
16
- attr_reader :port
17
-
18
- # Public: Called to extract files from the source filesystem.
19
- #
20
- # Returns an array containing the paths to all files extracted.
21
- def extract
22
- connection do |sftp|
23
- entries.map do |entry|
24
- local_file = File.join(@local_path, entry.name)
25
- @logger.info "Downloading #{entry.name} to #{local_file}"
26
- retry_download { sftp.download!(File.join(@remote_path, entry.name), local_file) }
27
- local_file
28
- end
29
- end
30
- end
31
-
32
- # Public: Returns an array of all FileSystemEntry instances that are in the remote_path.
33
- def all_entries
34
- @all_entries ||= all_entries!
35
- end
36
-
37
- def all_entries!
38
- sftp_entries = connection { |sftp| sftp.dir.entries(@remote_path) }
39
- sftp_entries.map do |entry|
40
- # Early versions of the protocol don't support create time, fake it with modified time?
41
- FileSystemEntry.new(
42
- pathname: File.join(@remote_path, entry.name),
43
- create_time: entry.attributes.respond_to?(:createtime) ? entry.attributes.createtime : entry.attributes.mtime,
44
- modified_time: entry.attributes.mtime
45
- )
46
- end
47
- end
48
-
49
-
50
- private
51
-
52
- def init_sftp_file(*args, credentials:, **kargs)
53
- @host = credentials.fetch(:host)
54
- @username = credentials.fetch(:username)
55
- @password = credentials.fetch(:password)
56
- @port = credentials.fetch(:port, '22')
57
- end
58
-
59
- def connection(&block)
60
- result = nil
61
- Net::SFTP.start(@host, @username, password: @password, port: @port) do |sftp|
62
- result = yield sftp
63
- end
64
- result
65
- end
66
-
67
- def retry_download(&block)
68
- 1.upto(N_RETRY).each do |itry|
69
- begin
70
- block.call
71
- break
72
- rescue RuntimeError => err
73
- raise err unless itry < N_RETRY
74
- @logger.error "Download failed with error: #{err.message}"
75
- @logger.error "Retry attempt #{itry}/#{N_RETRY-1}"
76
- sleep(1)
77
- end
78
- end
79
- end
80
- end
81
-
82
- end
83
- end
@@ -1,79 +0,0 @@
1
- require_relative '../remi_spec'
2
-
3
- describe DataSource::CsvFile do
4
-
5
- it "converts a CSV into a dataframe" do
6
- csv = Remi::DataSource::CsvFile.new(
7
- extractor: 'spec/fixtures/basic.csv'
8
- )
9
-
10
- expected_df = Remi::DataFrame::Daru.new(
11
- {
12
- column_a: ['value 1A', 'value 2A'],
13
- column_b: ['value 1B', 'value 2B']
14
- }
15
- )
16
- expect(csv.df.to_a).to eq expected_df.to_a
17
- end
18
-
19
- it "adds filename when requested" do
20
- csv = Remi::DataSource::CsvFile.new(
21
- extractor: 'spec/fixtures/basic.csv',
22
- filename_field: :from_file
23
- )
24
-
25
- expected_files = [Pathname.new('spec/fixtures/basic.csv').realpath.to_s] * 2
26
- expect(csv.df[:from_file].to_a).to eq expected_files
27
- end
28
-
29
- it "preprocesses records when required" do
30
- csv = Remi::DataSource::CsvFile.new(
31
- extractor: 'spec/fixtures/unsupported_escape.csv',
32
- preprocessor: ->(line) { line.gsub(/\\"/,'""') }
33
- )
34
-
35
- expected_df = Remi::DataFrame::Daru.new(
36
- {
37
- column_a: ['value 1A', 'value 2A'],
38
- column_b: ['value "1B"', 'value "2B"']
39
- }
40
- )
41
- expect(csv.df.to_a).to eq expected_df.to_a
42
- end
43
-
44
- it "accepts standard Ruby CSV options" do
45
- csv = Remi::DataSource::CsvFile.new(
46
- extractor: 'spec/fixtures/basic.csv',
47
- preprocessor: ->(line) { line.gsub(/,/,'|') },
48
- csv_options: { col_sep: '|' }
49
- )
50
-
51
- expected_df = Remi::DataFrame::Daru.new(
52
- {
53
- column_a: ['value 1A', 'value 2A'],
54
- column_b: ['value 1B', 'value 2B']
55
- }
56
- )
57
- expect(csv.df.to_a).to eq expected_df.to_a
58
- end
59
-
60
- it "combines multiple csv files into a single dataframe" do
61
- csv = Remi::DataSource::CsvFile.new(
62
- extractor: Remi::Extractor::LocalFile.new(
63
- remote_path: 'spec/fixtures',
64
- pattern: /basic(|2)\.csv/
65
- )
66
- )
67
-
68
- expected_df = Remi::DataFrame::Daru.new(
69
- {
70
- column_a: ['value 1A', 'value 2A', 'value 1A', 'value 2A'],
71
- column_b: ['value 1B', 'value 2B', nil, nil],
72
- column_c: [nil, nil, 'value 1C', 'value 2C']
73
- }
74
- )
75
-
76
- expect(csv.df.to_a).to eq expected_df.to_a
77
- end
78
-
79
- end
@@ -1,27 +0,0 @@
1
- require_relative '../remi_spec'
2
-
3
- describe DataSource::DataFrame do
4
- it "converts data into a dataframe" do
5
- source_dataframe = Remi::DataSource::DataFrame.new(
6
- fields: {
7
- :col1 => {},
8
- :col2 => {}
9
- },
10
- data: [
11
- ['11', '12'],
12
- ['21', '22'],
13
- ['31', '32']
14
- ]
15
- )
16
-
17
- expected_df = Remi::DataFrame::Daru.new(
18
- {
19
- col1: ['11', '21', '31'],
20
- col2: ['12', '22', '32']
21
- }
22
- )
23
-
24
- expect(source_dataframe.df).to be_a Remi::DataFrame
25
- expect(source_dataframe.df.to_a).to eq expected_df.to_a
26
- end
27
- end