remi 0.2.42 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.yardopts +7 -0
- data/Gemfile +1 -1
- data/Gemfile.lock +13 -26
- data/README.md +1 -1
- data/features/step_definitions/remi_step.rb +33 -13
- data/features/sub_job_example.feature +24 -0
- data/features/sub_transform_example.feature +35 -0
- data/features/sub_transform_many_to_many.feature +49 -0
- data/features/support/env_app.rb +1 -1
- data/jobs/all_jobs_shared.rb +19 -16
- data/jobs/copy_source_job.rb +11 -9
- data/jobs/csv_file_target_job.rb +10 -9
- data/jobs/json_job.rb +18 -14
- data/jobs/metadata_job.rb +33 -28
- data/jobs/parameters_job.rb +14 -11
- data/jobs/sample_job.rb +106 -77
- data/jobs/sftp_file_target_job.rb +14 -13
- data/jobs/sub_job_example_job.rb +86 -0
- data/jobs/sub_transform_example_job.rb +43 -0
- data/jobs/sub_transform_many_to_many_job.rb +46 -0
- data/jobs/transforms/concatenate_job.rb +16 -12
- data/jobs/transforms/data_frame_sieve_job.rb +24 -19
- data/jobs/transforms/date_diff_job.rb +15 -11
- data/jobs/transforms/nvl_job.rb +16 -12
- data/jobs/transforms/parse_date_job.rb +17 -14
- data/jobs/transforms/partitioner_job.rb +27 -19
- data/jobs/transforms/prefix_job.rb +13 -10
- data/jobs/transforms/truncate_job.rb +14 -10
- data/jobs/transforms/truthy_job.rb +11 -8
- data/lib/remi.rb +25 -11
- data/lib/remi/data_frame.rb +4 -4
- data/lib/remi/data_frame/daru.rb +1 -37
- data/lib/remi/data_subject.rb +234 -48
- data/lib/remi/data_subjects/csv_file.rb +171 -0
- data/lib/remi/data_subjects/data_frame.rb +106 -0
- data/lib/remi/data_subjects/file_system.rb +115 -0
- data/lib/remi/data_subjects/local_file.rb +109 -0
- data/lib/remi/data_subjects/none.rb +31 -0
- data/lib/remi/data_subjects/postgres.rb +186 -0
- data/lib/remi/data_subjects/s3_file.rb +84 -0
- data/lib/remi/data_subjects/salesforce.rb +211 -0
- data/lib/remi/data_subjects/sftp_file.rb +196 -0
- data/lib/remi/data_subjects/sub_job.rb +50 -0
- data/lib/remi/dsl.rb +74 -0
- data/lib/remi/encoder.rb +45 -0
- data/lib/remi/extractor.rb +21 -0
- data/lib/remi/field_symbolizers.rb +1 -0
- data/lib/remi/job.rb +279 -113
- data/lib/remi/job/parameters.rb +90 -0
- data/lib/remi/job/sub_job.rb +35 -0
- data/lib/remi/job/transform.rb +165 -0
- data/lib/remi/loader.rb +22 -0
- data/lib/remi/monkeys/daru.rb +4 -0
- data/lib/remi/parser.rb +44 -0
- data/lib/remi/testing/business_rules.rb +17 -23
- data/lib/remi/testing/data_stub.rb +2 -2
- data/lib/remi/version.rb +1 -1
- data/remi.gemspec +3 -0
- data/spec/data_subject_spec.rb +475 -11
- data/spec/data_subjects/csv_file_spec.rb +69 -0
- data/spec/data_subjects/data_frame_spec.rb +52 -0
- data/spec/{extractor → data_subjects}/file_system_spec.rb +0 -0
- data/spec/{extractor → data_subjects}/local_file_spec.rb +0 -0
- data/spec/data_subjects/none_spec.rb +41 -0
- data/spec/data_subjects/postgres_spec.rb +80 -0
- data/spec/{extractor → data_subjects}/s3_file_spec.rb +0 -0
- data/spec/data_subjects/salesforce_spec.rb +117 -0
- data/spec/{extractor → data_subjects}/sftp_file_spec.rb +16 -0
- data/spec/data_subjects/sub_job_spec.rb +33 -0
- data/spec/encoder_spec.rb +38 -0
- data/spec/extractor_spec.rb +11 -0
- data/spec/fixtures/sf_bulk_helper_stubs.rb +443 -0
- data/spec/job/transform_spec.rb +257 -0
- data/spec/job_spec.rb +507 -0
- data/spec/loader_spec.rb +11 -0
- data/spec/parser_spec.rb +38 -0
- data/spec/sf_bulk_helper_spec.rb +117 -0
- data/spec/testing/data_stub_spec.rb +5 -3
- metadata +109 -27
- data/features/aggregate.feature +0 -42
- data/jobs/aggregate_job.rb +0 -31
- data/jobs/transforms/transform_jobs.rb +0 -4
- data/lib/remi/data_subject/csv_file.rb +0 -162
- data/lib/remi/data_subject/data_frame.rb +0 -52
- data/lib/remi/data_subject/postgres.rb +0 -134
- data/lib/remi/data_subject/salesforce.rb +0 -136
- data/lib/remi/data_subject/sftp_file.rb +0 -65
- data/lib/remi/extractor/file_system.rb +0 -92
- data/lib/remi/extractor/local_file.rb +0 -43
- data/lib/remi/extractor/s3_file.rb +0 -57
- data/lib/remi/extractor/sftp_file.rb +0 -83
- data/spec/data_subject/csv_file_spec.rb +0 -79
- data/spec/data_subject/data_frame.rb +0 -27
@@ -0,0 +1,171 @@
|
|
1
|
+
module Remi
|
2
|
+
|
3
|
+
# @api private
|
4
|
+
#
|
5
|
+
# Contains methods shared between CsvFile Parser/Encoder
|
6
|
+
module DataSubject::CsvFile
|
7
|
+
def self.included(base)
|
8
|
+
base.extend(CsvFileClassMethods)
|
9
|
+
end
|
10
|
+
|
11
|
+
module CsvFileClassMethods
|
12
|
+
def default_csv_options
|
13
|
+
@default_csv_options ||= CSV::DEFAULT_OPTIONS.merge({
|
14
|
+
headers: true,
|
15
|
+
header_converters: Remi::FieldSymbolizers[:standard],
|
16
|
+
converters: [],
|
17
|
+
col_sep: ',',
|
18
|
+
encoding: 'UTF-8',
|
19
|
+
quote_char: '"'
|
20
|
+
})
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
# @api public
|
26
|
+
#
|
27
|
+
# CsvFile parser
|
28
|
+
#
|
29
|
+
# @example
|
30
|
+
#
|
31
|
+
# class MyJob < Remi::Job
|
32
|
+
# source :some_file do
|
33
|
+
# extractor Remi::Extractor::LocalFile.new(
|
34
|
+
# remote_path: 'some_file.csv'
|
35
|
+
# )
|
36
|
+
# parser Remi::Parser::CsvFile.new(
|
37
|
+
# csv_options: {
|
38
|
+
# headers: true,
|
39
|
+
# col_sep: '|'
|
40
|
+
# }
|
41
|
+
# )
|
42
|
+
# end
|
43
|
+
# end
|
44
|
+
#
|
45
|
+
# job = MyJob.new
|
46
|
+
# job.some_file.df
|
47
|
+
# # =>#<Daru::DataFrame:70153153438500 @name = 4c59cfdd-7de7-4264-8666-83153f46a9e4 @size = 3>
|
48
|
+
# # id name
|
49
|
+
# # 0 1 Albert
|
50
|
+
# # 1 2 Betsy
|
51
|
+
# # 2 3 Camu
|
52
|
+
class Parser::CsvFile < Parser
|
53
|
+
include Remi::DataSubject::CsvFile
|
54
|
+
|
55
|
+
# @param csv_options [Hash] Standard Ruby CSV parsing options.
|
56
|
+
# @param filename_field [Symbol] Name of the field to be used to write
|
57
|
+
# the filename of the CSV being parsed (default: nil, meaning no field will be used)
|
58
|
+
# @param preprocessor [Proc] A proc used to pre-process lines of the CSV file before being parsed
|
59
|
+
def initialize(*args, **kargs, &block)
|
60
|
+
super
|
61
|
+
init_csv_file(*args, **kargs, &block)
|
62
|
+
end
|
63
|
+
|
64
|
+
# @return [Hash] Csv options hash
|
65
|
+
attr_reader :csv_options
|
66
|
+
|
67
|
+
# Converts a list of filenames into a dataframe after parsing them
|
68
|
+
# according to the CSV options that were set
|
69
|
+
# @param data [Object] Extracted data that needs to be parsed
|
70
|
+
# @return [Remi::DataFrame] The data converted into a dataframe
|
71
|
+
def parse(data)
|
72
|
+
# Assumes that each file has exactly the same structure
|
73
|
+
result_df = nil
|
74
|
+
Array(data).each_with_index do |filename, idx|
|
75
|
+
filename = filename.to_s
|
76
|
+
|
77
|
+
logger.info "Converting #{filename} to a dataframe"
|
78
|
+
processed_filename = preprocess(filename)
|
79
|
+
csv_df = Daru::DataFrame.from_csv processed_filename, @csv_options
|
80
|
+
|
81
|
+
csv_df[@filename_field] = Daru::Vector.new([filename] * csv_df.size, index: csv_df.index) if @filename_field
|
82
|
+
if idx == 0
|
83
|
+
result_df = csv_df
|
84
|
+
else
|
85
|
+
result_df = result_df.concat csv_df
|
86
|
+
end
|
87
|
+
end
|
88
|
+
|
89
|
+
Remi::DataFrame.create(:daru, result_df)
|
90
|
+
end
|
91
|
+
|
92
|
+
|
93
|
+
private
|
94
|
+
|
95
|
+
def preprocess(filename)
|
96
|
+
return filename unless @preprocessor
|
97
|
+
logger.info "Preprocessing #{filename}"
|
98
|
+
tmp_filename = File.join(Remi::Settings.work_dir, "#{Pathname.new(filename).basename}-#{SecureRandom.uuid}")
|
99
|
+
|
100
|
+
dirname = Pathname.new(tmp_filename).dirname
|
101
|
+
FileUtils.mkdir_p(dirname) unless File.directory? dirname
|
102
|
+
|
103
|
+
File.open(tmp_filename, 'w') do |outfile|
|
104
|
+
File.foreach(filename) do |in_line|
|
105
|
+
outfile.write @preprocessor.call(in_line)
|
106
|
+
end
|
107
|
+
end
|
108
|
+
|
109
|
+
tmp_filename
|
110
|
+
end
|
111
|
+
|
112
|
+
def init_csv_file(*args, csv_options: {}, filename_field: nil, preprocessor: nil, **kargs, &block)
|
113
|
+
@csv_options = self.class.default_csv_options.merge(csv_options)
|
114
|
+
@filename_field = filename_field
|
115
|
+
@preprocessor = preprocessor
|
116
|
+
end
|
117
|
+
end
|
118
|
+
|
119
|
+
|
120
|
+
|
121
|
+
|
122
|
+
# CsvFile Encoder
|
123
|
+
#
|
124
|
+
# @example
|
125
|
+
# class MyJob < Remi::Job
|
126
|
+
# target :my_target do
|
127
|
+
# encoder Remi::Encoder::CsvFile.new(
|
128
|
+
# csv_options: { col_sep: '|' }
|
129
|
+
# )
|
130
|
+
# loader Remi::Loader::LocalFile.new(
|
131
|
+
# path: 'test.csv'
|
132
|
+
# )
|
133
|
+
# end
|
134
|
+
# end
|
135
|
+
#
|
136
|
+
# my_df = Daru::DataFrame.new({ a: 1.upto(5).to_a, b: 6.upto(10) })
|
137
|
+
# job = MyJob.new
|
138
|
+
# job.my_target.df = my_df
|
139
|
+
# job.my_target.load
|
140
|
+
class Encoder::CsvFile < Encoder
|
141
|
+
include Remi::DataSubject::CsvFile
|
142
|
+
|
143
|
+
# @param work_path [String,Pathname] Path to a directory used to temporarily store CSV files (default: Settings.work_dir)
|
144
|
+
# @param csv_options [Hash] Standard Ruby CSV parser options.
|
145
|
+
def initialize(*args, **kargs, &block)
|
146
|
+
super
|
147
|
+
init_csv_file_encoder(*args, **kargs, &block)
|
148
|
+
end
|
149
|
+
|
150
|
+
default_csv_options[:row_sep] = "\n"
|
151
|
+
|
152
|
+
# @return [Hash] Csv options hash
|
153
|
+
attr_reader :csv_options
|
154
|
+
|
155
|
+
# Converts the dataframe to a CSV file stored in the local work directory.
|
156
|
+
#
|
157
|
+
# @param dataframe [Remi::DataFrame] The dataframe to be encoded
|
158
|
+
# @return [Object] The path to the file
|
159
|
+
def encode(dataframe)
|
160
|
+
logger.info "Writing CSV file to temporary location #{@working_file}"
|
161
|
+
dataframe.write_csv @working_file, @csv_options
|
162
|
+
@working_file
|
163
|
+
end
|
164
|
+
|
165
|
+
private
|
166
|
+
def init_csv_file_encoder(*args, work_path: Settings.work_dir, csv_options: {}, **kargs, &block)
|
167
|
+
@working_file = File.join(work_path, SecureRandom.uuid)
|
168
|
+
@csv_options = self.class.default_csv_options.merge(csv_options)
|
169
|
+
end
|
170
|
+
end
|
171
|
+
end
|
@@ -0,0 +1,106 @@
|
|
1
|
+
module Remi
|
2
|
+
|
3
|
+
# DataFrame extractor.
|
4
|
+
# This class is used to hard-code a dataframe as a simple array of rows.
|
5
|
+
#
|
6
|
+
# @example
|
7
|
+
#
|
8
|
+
# class MyJob < Remi::Job
|
9
|
+
# source :my_df do
|
10
|
+
# fields ({ id: {}, name: {}})
|
11
|
+
# extractor Remi::Extractor::DataFrame.new(
|
12
|
+
# data: [
|
13
|
+
# [1, 'Albert'],
|
14
|
+
# [2, 'Betsy'],
|
15
|
+
# [3, 'Camu']
|
16
|
+
# ]
|
17
|
+
# )
|
18
|
+
# parser Remi::Parser::DataFrame.new
|
19
|
+
# end
|
20
|
+
# end
|
21
|
+
#
|
22
|
+
# job = MyJob.new
|
23
|
+
# job.my_df.df.inspect
|
24
|
+
# # =>#<Daru::DataFrame:70153153438500 @name = 4c59cfdd-7de7-4264-8666-83153f46a9e4 @size = 3>
|
25
|
+
# # id name
|
26
|
+
# # 0 1 Albert
|
27
|
+
# # 1 2 Betsy
|
28
|
+
# # 2 3 Camu
|
29
|
+
class Extractor::DataFrame < Extractor
|
30
|
+
|
31
|
+
# @param data [Array<Array>] An array of arrays representing rows of a dataframe.
|
32
|
+
def initialize(*args, **kargs, &block)
|
33
|
+
super
|
34
|
+
init_data_frame_extractor(*args, **kargs, &block)
|
35
|
+
end
|
36
|
+
|
37
|
+
attr_accessor :data
|
38
|
+
|
39
|
+
# @return [Object] self
|
40
|
+
def extract
|
41
|
+
self
|
42
|
+
end
|
43
|
+
|
44
|
+
private
|
45
|
+
|
46
|
+
def init_data_frame_extractor(*args, data: [], **kargs, &block)
|
47
|
+
@data = data
|
48
|
+
end
|
49
|
+
|
50
|
+
end
|
51
|
+
|
52
|
+
# DataFrame parser.
|
53
|
+
# In order for the data held by an Extractor::DataFrame to be parsed correctly, fields must be defined
|
54
|
+
# on the data subject.
|
55
|
+
#
|
56
|
+
# @example
|
57
|
+
#
|
58
|
+
# class MyJob < Remi::Job
|
59
|
+
# source :my_df do
|
60
|
+
# fields ({ id: {}, name: {}})
|
61
|
+
# extractor Remi::Extractor::DataFrame.new(
|
62
|
+
# data: [
|
63
|
+
# [1, 'Albert'],
|
64
|
+
# [2, 'Betsy'],
|
65
|
+
# [3, 'Camu']
|
66
|
+
# ]
|
67
|
+
# )
|
68
|
+
# parser Remi::Parser::DataFrame.new
|
69
|
+
# end
|
70
|
+
# end
|
71
|
+
#
|
72
|
+
# job = MyJob.new
|
73
|
+
# job.my_df.df.inspect
|
74
|
+
# # =>#<Daru::DataFrame:70153153438500 @name = 4c59cfdd-7de7-4264-8666-83153f46a9e4 @size = 3>
|
75
|
+
# # id name
|
76
|
+
# # 0 1 Albert
|
77
|
+
# # 1 2 Betsy
|
78
|
+
# # 2 3 Camu
|
79
|
+
class Parser::DataFrame < Parser
|
80
|
+
# @param df_extract [Extractor::DataFrame] An object containing data extracted from memory
|
81
|
+
# @return [Remi::DataFrame] The data converted into a dataframe
|
82
|
+
def parse(df_extract)
|
83
|
+
Remi::DataFrame.create(:daru, df_extract.data.transpose, order: fields.keys)
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
# DataFrame encoder
|
88
|
+
class Encoder::DataFrame < Encoder
|
89
|
+
# @param data_frame [Remi::DataFrame] The data_frame to be encoded
|
90
|
+
# @return [Object] The data_frame
|
91
|
+
def encode(data_frame)
|
92
|
+
data_frame
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
# DataFrame loader
|
97
|
+
# Not sure this is needed, right?
|
98
|
+
# Maybe on SubJobs?
|
99
|
+
class Loader::DataFrame < Loader
|
100
|
+
# @param data [Object] Encoded data to be loaded into the target (unused by this loader)
|
101
|
+
# @return [true] On success
|
102
|
+
def load(data)
|
103
|
+
true
|
104
|
+
end
|
105
|
+
end
|
106
|
+
end
|
@@ -0,0 +1,115 @@
|
|
1
|
+
module Remi
|
2
|
+
|
3
|
+
# Defines properties of an entry in a filesystem.
|
4
|
+
class Extractor::FileSystemEntry
|
5
|
+
# @param pathname [String] The path to the file system entry
|
6
|
+
# @param create_time [Time] The time the entry was created
|
7
|
+
# @param modified_time [Time] The time the entry was last modified
|
8
|
+
# @param raw [Object] An object that captures all other aspects of the entry, native to the system the entry lives on
|
9
|
+
def initialize(pathname:, create_time:, modified_time:, raw: nil)
|
10
|
+
@pathname = Pathname.new(pathname)
|
11
|
+
@create_time = create_time
|
12
|
+
@modified_time = modified_time
|
13
|
+
@raw = raw
|
14
|
+
end
|
15
|
+
|
16
|
+
attr_reader :pathname, :create_time, :modified_time, :raw
|
17
|
+
|
18
|
+
# @return [String] the base name of the entry
|
19
|
+
def name
|
20
|
+
@pathname.basename.to_s
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
|
25
|
+
# Parent class used to describe things that behave like file systems (e.g.,
|
26
|
+
# local file systems, ftp servers, S3 objects) to be used for extraction.
|
27
|
+
#
|
28
|
+
# @param remote_path [String] Path on the remote system that contains the files
|
29
|
+
# @param pattern [Regexp] Only files with a name that matches this regular
|
30
|
+
# expression are extracted
|
31
|
+
# @param local_path [String] Local path to put copies of extracted files
|
32
|
+
# @param most_recent_only [true,false] Only extract the most recent file
|
33
|
+
# that matches the given pattern
|
34
|
+
# @param group_by [Regexp] A regular expression used to group files together
|
35
|
+
# and only extract the most recent file in each group
|
36
|
+
# @param most_recent_by [Symbol] Indicates the FileSystemEntry property used to determine which
|
37
|
+
# file is the most recent (`:create_time` (default), `:modified_time`, `:name`)
|
38
|
+
|
39
|
+
class Extractor::FileSystem < Extractor
|
40
|
+
class FileNotFoundError < StandardError; end
|
41
|
+
|
42
|
+
def initialize(*args, **kargs, &block)
|
43
|
+
super
|
44
|
+
init_file_system(*args, **kargs)
|
45
|
+
end
|
46
|
+
|
47
|
+
attr_reader :remote_path
|
48
|
+
attr_reader :pattern
|
49
|
+
attr_reader :local_path
|
50
|
+
attr_reader :most_recent_only
|
51
|
+
attr_reader :group_by
|
52
|
+
attr_reader :most_recent_by
|
53
|
+
|
54
|
+
# Public: Called to extract files from the source filesystem.
# This base implementation always raises; subclasses such as
# Extractor::LocalFile provide the real behavior.
#
# Returns an array containing the paths to all files extracted.
# Raises NoMethodError when the subclass has not defined the method.
def extract
  raise NoMethodError, "#{__method__} not defined for #{self.class.name}"
end
|
60
|
+
|
61
|
+
# Public: Returns an array of all FileSystemEntry instances that are in the remote_path.
# NOTE: all_entries is responsible for matching the path using @remote_path
# This base implementation always raises; subclasses such as
# Extractor::LocalFile provide the real behavior.
#
# Raises NoMethodError when the subclass has not defined the method.
def all_entries
  raise NoMethodError, "#{__method__} not defined for #{self.class.name}"
end
|
66
|
+
|
67
|
+
# Public: Returns just the entries that are to be extracted.
|
68
|
+
def entries
|
69
|
+
if @group_by
|
70
|
+
most_recent_matching_entry_in_group
|
71
|
+
elsif @most_recent_only
|
72
|
+
Array(most_recent_matching_entry)
|
73
|
+
else
|
74
|
+
matching_entries
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
def matching_entries
|
79
|
+
all_entries.select { |e| @pattern.match e.name }
|
80
|
+
end
|
81
|
+
|
82
|
+
# Picks, among the entries returned by matching_entries, the one with the
# greatest value of the property named by @most_recent_by.
#
# Returns the selected entry, or nil when there are no matching entries.
def most_recent_matching_entry
  # max_by finds the entry in a single pass instead of sorting and
  # reversing the whole list just to read its first element.
  matching_entries.max_by { |entry| entry.send(@most_recent_by) }
end
|
85
|
+
|
86
|
+
def most_recent_matching_entry_in_group
|
87
|
+
entries_with_group = matching_entries.map do |entry|
|
88
|
+
match = entry.name.match(@group_by)
|
89
|
+
next unless match
|
90
|
+
|
91
|
+
group = match.to_a[1..-1]
|
92
|
+
{ group: group, entry: entry }
|
93
|
+
end.compact
|
94
|
+
sorted_entries_with_group = entries_with_group.sort_by { |e| [e[:group], e[:entry].send(@most_recent_by)] }.reverse
|
95
|
+
|
96
|
+
last_group = nil
|
97
|
+
sorted_entries_with_group.map do |entry|
|
98
|
+
next unless entry[:group] != last_group
|
99
|
+
last_group = entry[:group]
|
100
|
+
entry[:entry]
|
101
|
+
end.compact
|
102
|
+
end
|
103
|
+
|
104
|
+
private
|
105
|
+
|
106
|
+
def init_file_system(*args, remote_path:, pattern: /.*/, local_path: Settings.work_dir, most_recent_only: false, group_by: nil, most_recent_by: :create_time, **kargs, &block)
|
107
|
+
@remote_path = Pathname.new(remote_path)
|
108
|
+
@pattern = pattern
|
109
|
+
@local_path = Pathname.new(local_path)
|
110
|
+
@most_recent_only = most_recent_only
|
111
|
+
@group_by = group_by
|
112
|
+
@most_recent_by = most_recent_by
|
113
|
+
end
|
114
|
+
end
|
115
|
+
end
|
@@ -0,0 +1,109 @@
|
|
1
|
+
module Remi
|
2
|
+
|
3
|
+
# Local file extractor
|
4
|
+
# Used to "extract" a file from a local filesystem.
|
5
|
+
# Note that even though the file is local, we still use the parameter `remote_path`
|
6
|
+
# to indicate the path. This makes this class consistent with Remi::Extractor::FileSystem.
|
7
|
+
#
|
8
|
+
# @example
|
9
|
+
#
|
10
|
+
# class MyJob < Remi::Job
|
11
|
+
# source :some_file do
|
12
|
+
# extractor Remi::Extractor::LocalFile.new(
|
13
|
+
# remote_path: 'some_file.csv'
|
14
|
+
# )
|
15
|
+
# parser Remi::Parser::CsvFile.new(
|
16
|
+
# csv_options: {
|
17
|
+
# headers: true,
|
18
|
+
# col_sep: '|'
|
19
|
+
# }
|
20
|
+
# )
|
21
|
+
# end
|
22
|
+
# end
|
23
|
+
#
|
24
|
+
# job = MyJob.new
|
25
|
+
# job.some_file.df
|
26
|
+
# # =>#<Daru::DataFrame:70153153438500 @name = 4c59cfdd-7de7-4264-8666-83153f46a9e4 @size = 3>
|
27
|
+
# # id name
|
28
|
+
# # 0 1 Albert
|
29
|
+
# # 1 2 Betsy
|
30
|
+
# # 2 3 Camu
|
31
|
+
class Extractor::LocalFile < Extractor::FileSystem
|
32
|
+
def initialize(*args, **kargs)
|
33
|
+
super
|
34
|
+
init_local_file(*args, **kargs)
|
35
|
+
end
|
36
|
+
|
37
|
+
# Called to extract files from the source filesystem.
|
38
|
+
# @return [Array<Pathname>] An array of paths to the files extracted
|
39
|
+
def extract
|
40
|
+
entries.map(&:pathname)
|
41
|
+
end
|
42
|
+
|
43
|
+
# @return [Array<Extractor::FileSystemEntry>] List of objects in the remote path
|
44
|
+
def all_entries
|
45
|
+
@all_entries ||= all_entries!
|
46
|
+
end
|
47
|
+
|
48
|
+
# @return [Array<Extractor::FileSystemEntry>] List of objects in the remote path
|
49
|
+
def all_entries!
|
50
|
+
dir = @remote_path.directory? ? @remote_path + '*' : @remote_path
|
51
|
+
Dir[dir].map do |entry|
|
52
|
+
path = Pathname.new(entry)
|
53
|
+
if path.file?
|
54
|
+
Extractor::FileSystemEntry.new(
|
55
|
+
pathname: path.realpath.to_s,
|
56
|
+
create_time: path.ctime,
|
57
|
+
modified_time: path.mtime
|
58
|
+
)
|
59
|
+
end
|
60
|
+
end.compact
|
61
|
+
end
|
62
|
+
|
63
|
+
private
|
64
|
+
|
65
|
+
def init_local_file(*args, **kargs)
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
|
70
|
+
# Local file loader
|
71
|
+
# Used to output files to a local filesystem
|
72
|
+
# @example
|
73
|
+
# class MyJob < Remi::Job
|
74
|
+
# target :my_target do
|
75
|
+
# encoder Remi::Encoder::CsvFile.new(
|
76
|
+
# csv_options: { col_sep: '|' }
|
77
|
+
# )
|
78
|
+
# loader Remi::Loader::LocalFile.new(
|
79
|
+
# path: 'test.csv'
|
80
|
+
# )
|
81
|
+
# end
|
82
|
+
# end
|
83
|
+
#
|
84
|
+
# my_df = Daru::DataFrame.new({ a: 1.upto(5).to_a, b: 6.upto(10) })
|
85
|
+
# job = MyJob.new
|
86
|
+
# job.my_target.df = my_df
|
87
|
+
# job.my_target.load
|
88
|
+
class Loader::LocalFile < Loader
|
89
|
+
def initialize(*args, **kargs)
|
90
|
+
super
|
91
|
+
init_local_file_loader(*args, **kargs)
|
92
|
+
end
|
93
|
+
|
94
|
+
# Moves the file from the temporary workspace to another local path
# @param data [Object] The path to the file in the temporary work location
# @return [true] On success
def load(data)
  logger.info "Writing file #{@local_path}"
  FileUtils.mv(data, @local_path)
  # FileUtils.mv does not return true, so return it explicitly to honor
  # the documented contract.
  true
end
|
101
|
+
|
102
|
+
|
103
|
+
private
|
104
|
+
|
105
|
+
def init_local_file_loader(*args, path:, **kargs)
|
106
|
+
@local_path = path
|
107
|
+
end
|
108
|
+
end
|
109
|
+
end
|