hasta 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.cane +1 -0
- data/.gitignore +3 -0
- data/.rspec +2 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +20 -0
- data/README.md +150 -0
- data/Rakefile +15 -0
- data/hasta.gemspec +29 -0
- data/lib/hasta.rb +46 -0
- data/lib/hasta/cached_s3_file.rb +21 -0
- data/lib/hasta/combined_data_source.rb +35 -0
- data/lib/hasta/combined_storage.rb +30 -0
- data/lib/hasta/configuration.rb +88 -0
- data/lib/hasta/emr_job_definition.rb +104 -0
- data/lib/hasta/emr_node.rb +103 -0
- data/lib/hasta/env.rb +35 -0
- data/lib/hasta/execution_context.rb +90 -0
- data/lib/hasta/filter.rb +40 -0
- data/lib/hasta/filtered_s3_file.rb +34 -0
- data/lib/hasta/identity_mapper.rb +17 -0
- data/lib/hasta/identity_reducer.rb +18 -0
- data/lib/hasta/in_memory_data_sink.rb +40 -0
- data/lib/hasta/in_memory_data_source.rb +35 -0
- data/lib/hasta/interpolate_string.rb +45 -0
- data/lib/hasta/local_file_path.rb +12 -0
- data/lib/hasta/local_storage.rb +41 -0
- data/lib/hasta/mapper.rb +23 -0
- data/lib/hasta/reducer.rb +29 -0
- data/lib/hasta/resolve_cached_s3_file.rb +29 -0
- data/lib/hasta/resolve_filtered_s3_file.rb +22 -0
- data/lib/hasta/runner.rb +32 -0
- data/lib/hasta/s3_data_sink.rb +48 -0
- data/lib/hasta/s3_data_source.rb +41 -0
- data/lib/hasta/s3_file.rb +56 -0
- data/lib/hasta/s3_file_cache.rb +23 -0
- data/lib/hasta/s3_storage.rb +21 -0
- data/lib/hasta/s3_uri.rb +60 -0
- data/lib/hasta/sorted_data_source.rb +36 -0
- data/lib/hasta/storage.rb +82 -0
- data/lib/hasta/tasks.rb +8 -0
- data/lib/hasta/tasks/runner.rb +84 -0
- data/lib/hasta/version.rb +3 -0
- data/spec/fixtures/hasta/filter_config.txt +1 -0
- data/spec/fixtures/hasta/json/emr_node.json +10 -0
- data/spec/fixtures/hasta/json/pipeline_definition.json +135 -0
- data/spec/fixtures/hasta/lib/failing_mapper.rb +19 -0
- data/spec/fixtures/hasta/lib/test_env_mapper.rb +20 -0
- data/spec/fixtures/hasta/lib/test_identity_mapper.rb +20 -0
- data/spec/fixtures/hasta/lib/test_types_mapper.rb +21 -0
- data/spec/fixtures/hasta/lib/types.rb +1 -0
- data/spec/fixtures/hasta/lib/unconventional_reducer.rb +17 -0
- data/spec/hasta/combined_data_source_spec.rb +25 -0
- data/spec/hasta/combined_storage_spec.rb +54 -0
- data/spec/hasta/configuration_spec.rb +49 -0
- data/spec/hasta/emr_job_definition_spec.rb +181 -0
- data/spec/hasta/emr_node_spec.rb +32 -0
- data/spec/hasta/env_spec.rb +30 -0
- data/spec/hasta/execution_context_spec.rb +67 -0
- data/spec/hasta/filter_spec.rb +66 -0
- data/spec/hasta/filtered_s3_file_spec.rb +45 -0
- data/spec/hasta/identity_mapper_spec.rb +22 -0
- data/spec/hasta/identity_reducer_spec.rb +20 -0
- data/spec/hasta/interpolate_string_spec.rb +44 -0
- data/spec/hasta/local_file_path_spec.rb +18 -0
- data/spec/hasta/local_storage_spec.rb +52 -0
- data/spec/hasta/mapper_spec.rb +26 -0
- data/spec/hasta/reducer_spec.rb +26 -0
- data/spec/hasta/resolved_cached_s3_file_spec.rb +68 -0
- data/spec/hasta/s3_data_source_spec.rb +39 -0
- data/spec/hasta/s3_file_cache_spec.rb +45 -0
- data/spec/hasta/s3_file_spec.rb +122 -0
- data/spec/hasta/s3_storage_spec.rb +24 -0
- data/spec/hasta/s3_uri_spec.rb +151 -0
- data/spec/hasta/sorted_data_source_spec.rb +22 -0
- data/spec/spec_helper.rb +24 -0
- data/spec/support/shared_contexts/hasta/local_fog_storage.rb +17 -0
- data/spec/support/shared_examples/hasta/storage_examples.rb +103 -0
- metadata +254 -0
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
# Copyright Swipely, Inc. All rights reserved.
|
|
2
|
+
|
|
3
|
+
require 'forwardable'
|
|
4
|
+
require 'json'
|
|
5
|
+
|
|
6
|
+
require 'hasta/emr_node'
|
|
7
|
+
require 'hasta/env'
|
|
8
|
+
require 'hasta/identity_mapper'
|
|
9
|
+
require 'hasta/mapper'
|
|
10
|
+
require 'hasta/reducer'
|
|
11
|
+
require 'hasta/identity_reducer'
|
|
12
|
+
require 'hasta/s3_data_source'
|
|
13
|
+
require 'hasta/s3_uri'
|
|
14
|
+
require 'hasta/s3_data_sink'
|
|
15
|
+
|
|
16
|
+
module Hasta
  # Defines the EMR job that is being tested.
  #
  # Wraps an EmrNode parsed from an Amazon Data Pipeline definition file and
  # exposes the pieces needed to run the job locally: the mapper/reducer,
  # parsed S3 input/output locations, environment variables, and the local
  # ruby files that back the job's steps.
  class EmrJobDefinition
    extend Forwardable

    # Loads the EmrActivity node with the given id from the pipeline
    # definition JSON file at file_path.
    #
    # Raises ArgumentError if the file contains no EmrActivity with that id.
    def self.load(file_path, id, scheduled_start_time = Time.now)
      objects = JSON.parse(File.read(file_path))['objects']
      activity = objects.find do |object|
        object['type'] == 'EmrActivity' && object['id'] == id
      end

      raise ArgumentError, "No EmrActivity for id: #{id} in file: #{file_path}" unless activity

      new(EmrNode.from_json(activity, scheduled_start_time))
    end

    def_delegators :emr_node, :id

    def initialize(emr_node)
      @emr_node = emr_node
    end

    # Parsed S3 URIs for every input path of the job.
    def input_paths
      @input_paths ||= emr_node.input_paths.map { |uri| S3URI.parse(uri) }
    end

    # Parsed S3 URI of the job's output location.
    def output_path
      @output_path ||= S3URI.parse(emr_node.output_path)
    end

    # Env combining the job's cmdenv variables with <TAG>_FILE_PATH
    # variables derived from the non-ruby cache files.
    def env
      @env ||= begin
        file_vars = {}
        emr_node.cache_files.each do |tag, uri|
          next if uri.end_with?('.rb')
          file_vars["#{tag.split('.').first.upcase}_FILE_PATH"] = S3URI.parse(uri)
        end
        Env.new(emr_node.env, file_vars)
      end
    end

    # Local paths for every .rb cache file referenced by the job.
    def ruby_files
      @ruby_files ||= emr_node.cache_files.values.
        select { |uri| uri.end_with?('.rb') }.
        map { |uri| local_path_to_step_file(S3URI.parse(uri)) }
    end

    def mapper
      @mapper ||= parse_mapper(emr_node.mapper)
    end

    def reducer
      @reducer ||= parse_reducer(emr_node.reducer)
    end

    # One S3DataSource per input path.
    def data_sources
      @data_sources ||= input_paths.map { |uri| S3DataSource.new(uri) }
    end

    def data_sink
      @data_sink ||= S3DataSink.new(output_path)
    end

    private

    attr_reader :emr_node

    # Maps an S3 step file URI to its expected location in the local project.
    def local_path_to_step_file(s3_uri)
      File.join(Hasta.project_root, Hasta.project_steps, s3_uri.basename)
    end

    # Returns IdentityMapper for identity commands, otherwise wraps the
    # referenced local step file in a Mapper.
    def parse_mapper(mapper_command)
      identity_commands = %w[cat org.apache.hadoop.mapred.lib.IdentityMapper]
      return IdentityMapper if identity_commands.include?(mapper_command)

      Mapper.new(local_path_to_step_file(S3URI.parse(mapper_command)))
    end

    # Returns IdentityReducer for identity commands, otherwise wraps the
    # referenced local step file in a Reducer.
    def parse_reducer(reducer_command)
      identity_commands = %w[cat org.apache.hadoop.mapred.lib.IdentityReducer]
      return IdentityReducer if identity_commands.include?(reducer_command)

      Reducer.new(local_path_to_step_file(S3URI.parse(reducer_command)))
    end
  end
end
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# Copyright Swipely, Inc. All rights reserved.
|
|
2
|
+
|
|
3
|
+
require 'hasta/interpolate_string'
|
|
4
|
+
|
|
5
|
+
module Hasta
  # Models the Amazon Data Pipeline configuration details for the EMR job
  # that is being tested.
  class EmrNode
    class << self
      # Builds an EmrNode from the JSON hash of an EmrActivity pipeline node.
      # scheduled_start_time is used to interpolate date expressions in the
      # input/output S3 paths.
      def from_json(json, scheduled_start_time = Time.now)
        command_line = parse_step_line(json['step'])

        new(
          :id => json['id'],
          :input_paths => command_line['input'],
          :output_path => command_line['output'].first,
          :mapper => command_line['mapper'].first,
          :reducer => command_line['reducer'].first,
          :cache_files => command_line['cacheFile'],
          :env => command_line['cmdenv'],
          :scheduled_start_time => scheduled_start_time
        )
      end

      private

      # Parses the 'step' attribute of an EMR configuration into a Hash
      # Sample step line:
      #   "/home/hadoop/contrib/streaming/hadoop-streaming.jar,
      #    -input,s3n://data-bucket/input1/,
      #    -output,s3://data-bucket/output/,
      #    -mapper,cat,
      #    -reducer,s3n://steps-bucket/path/to/reducer.rb,
      #    -cacheFile,s3://data-bucket/path/to/mappings.yml#mappings.yml,
      #    -cacheFile,s3://data-bucket/path/to/ignored.yml#ignored.yml,
      #    -cmdenv,API_KEY=123456,
      #    -cmdenv,ENVIRONMENT_NAME=uat"
      #
      # Sample output:
      # {
      #   "input" => ["s3n://data-bucket/input1/"],
      #   "output" => ["s3://data-bucket/output/"],
      #   "mapper" => ["cat"],
      #   "reducer" => ["s3n://steps-bucket/path/to/reducer.rb"],
      #   "cacheFile" => ["s3://data-bucket/path/to/mappings.yml#mappings.yml",
      #                   "s3://data-bucket/path/to/ignored.yml#ignored.yml"],
      #   "cmdenv" => ["API_KEY=123456", "ENVIRONMENT_NAME=uat"]
      # }
      #
      def parse_step_line(step)
        # Per-key default of a fresh array so repeated switches accumulate.
        parsed = Hash.new { |h, k| h[k] = [] }
        step.
          split(',-').
          drop(1). # drop the leading hadoop-streaming.jar path
          map { |value| i = value.index(','); [value[0...i], value[i+1..-1]] }.
          each do |switch, arg|
            parsed[switch] << arg
          end

        parsed
      end
    end

    def initialize(attributes)
      @attributes = attributes
    end

    def id
      attributes[:id]
    end

    # Input S3 paths with date expressions interpolated.
    # NOTE: fixed the memoization ivar name (@input_path -> @input_paths)
    # to match the method name; behavior is unchanged.
    def input_paths
      @input_paths ||= attributes[:input_paths].map { |path| interpolate(path) }
    end

    # Output S3 path with date expressions interpolated.
    def output_path
      @output_path ||= interpolate(attributes[:output_path])
    end

    def mapper
      attributes[:mapper]
    end

    def reducer
      attributes[:reducer]
    end

    # Maps local file name to S3 URI, e.g. "mappings.yml" => "s3://...".
    # Each raw entry has the form "s3://bucket/path/file.yml#file.yml".
    def cache_files
      @cache_files ||= Hash[attributes[:cache_files].map { |value| value.split('#').reverse }]
    end

    # Maps environment variable name to value from the "NAME=value" entries.
    def env
      @env ||= Hash[attributes[:env].map { |value| value.split('=') }]
    end

    private

    attr_reader :attributes

    # Resolves Data Pipeline date expressions against the scheduled start time.
    def interpolate(path)
      InterpolateString.evaluate(path, 'scheduledStartTime' => attributes[:scheduled_start_time])
    end
  end
end
|
data/lib/hasta/env.rb
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# Copyright Swipely, Inc. All rights reserved.
|
|
2
|
+
|
|
3
|
+
require 'hasta/local_file_path'
|
|
4
|
+
require 'hasta/s3_data_source'
|
|
5
|
+
|
|
6
|
+
module Hasta
  # Constructs the ENV variables required to run a local EMR job.
  #
  # Combines plain variables with file-backed variables: each entry in
  # +files+ maps a variable name to an S3 URI whose contents are written
  # to local storage, and the variable is set to the resulting local path.
  class Env
    attr_reader :variables, :files

    def initialize(
      variables = {},
      files = {},
      combined_storage = Hasta.combined_storage
    )
      @variables = variables
      @files = files
      @combined_storage = combined_storage
    end

    # Downloads every file-backed variable and returns the merged Hash of
    # all environment variables for the job.
    def setup
      downloaded = files.each_with_object({}) do |(name, s3_uri), result|
        source = S3DataSource.new(s3_uri, combined_storage)
        result[name] = LocalFilePath.for(combined_storage.write(s3_uri, source))
      end

      variables.merge(downloaded)
    end

    private

    attr_reader :combined_storage
  end
end
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# Copyright Swipely, Inc. All rights reserved.
|
|
2
|
+
|
|
3
|
+
require 'open3'
|
|
4
|
+
|
|
5
|
+
require 'hasta/s3_data_sink'
|
|
6
|
+
|
|
7
|
+
module Hasta
  # Executes each local EMR job in isolation by running the step's ruby
  # source file in a child process and streaming data through its
  # standard input/output.
  class ExecutionContext
    # Runs a ruby source file as a child process with a controlled
    # environment and load path, exposing its standard streams.
    class Subprocess
      attr_reader :stdin, :stdout, :stderr

      # ruby_files - local .rb files whose directories are added to the
      #              child's load path
      # env        - Hash of environment variables for the child process
      def initialize(ruby_files, env)
        @ruby_files = ruby_files
        @env = env
      end

      # Spawns `ruby source_file`, yields self so the caller can stream
      # stdin/stdout/stderr, then raises ExecutionError if the child
      # exits with a non-zero status.
      def start(source_file)
        Open3.popen3(*cmd_line(source_file)) do |stdin, stdout, stderr, wait_thr|
          @stdin, @stdout, @stderr, @wait_thr = stdin, stdout, stderr, wait_thr

          yield self

          if (exit_code = wait_thr.value.exitstatus) != 0
            raise ExecutionError, "#{source_file} exited with non-zero status: #{exit_code}"
          end
        end
      end

      private

      # NOTE: removed the dead `:source_file` entry from this attr_reader;
      # no @source_file ivar is ever assigned — the path is passed as an
      # argument to #start instead, so the reader always returned nil.
      attr_reader :env, :ruby_files

      # [env_hash, ruby, -I dir ..., source_file] suitable for popen3.
      def cmd_line(source_file)
        [env, ruby_exe_path] + load_path + [source_file]
      end

      # Absolute path to the currently running ruby interpreter.
      def ruby_exe_path
        File.join(RbConfig::CONFIG['bindir'], RbConfig::CONFIG['ruby_install_name'])
      end

      # -I flags covering the directories of all supplied ruby files.
      def load_path
        ruby_files.
          map { |file| File.expand_path(File.dirname(file)) }.
          uniq.
          map { |path| ['-I', path] }.
          flatten
      end
    end

    def initialize(ruby_files = [], env = {})
      @sub_process = Subprocess.new(ruby_files, env)
    end

    # Runs source_file in a subprocess, feeding it the lines of data_source
    # on stdin and appending each line of its stdout to data_sink. Lines on
    # stderr are logged as errors. Closes the data_sink when done and
    # returns the result of data_sink.close.
    def execute(source_file, data_source, data_sink)
      sub_process.start(source_file) do |sub_process|
        [
          stream_input(data_source, sub_process.stdin),
          stream_output(sub_process.stdout) { |line| data_sink << line },
          stream_output(sub_process.stderr) { |line| Hasta.logger.error line },
        ].each(&:join)
      end

      data_sink.close
    end

    private

    attr_reader :sub_process

    # Writes each line of the data source to the subprocess and closes its
    # stdin so the child sees EOF.
    def stream_input(data_source, io)
      Thread.new do
        data_source.each_line do |line|
          io.puts line
        end

        io.close_write
      end
    end

    # Reads the entire stream, then yields each line with trailing
    # whitespace removed.
    def stream_output(io)
      Thread.new do
        StringIO.new(io.read).each_line do |line|
          yield line.rstrip
        end
      end
    end
  end
end
|
data/lib/hasta/filter.rb
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# Copyright Swipely, Inc. All rights reserved.
|
|
2
|
+
|
|
3
|
+
require 'set'
|
|
4
|
+
|
|
5
|
+
module Hasta
  # The filter that is used to drop unwanted lines from input files.
  # A line is accepted if it matches any of the configured regexes.
  class Filter
    # Builds a Filter from a file containing one regular expression per line.
    #
    # Raises ConfigurationError if the file cannot be read or contains an
    # invalid regular expression.
    #
    # NOTE: the previous `if lines = File.read(file)` guard was removed —
    # File.read never returns nil (it raises on failure), so the condition
    # was always true and only obscured the flow.
    def self.from_file(file)
      lines = File.read(file)
      Hasta.logger.debug "Loading data filter file: #{File.expand_path(file)}"
      new(*lines.split("\n").map { |line| Regexp.new(line) })
    rescue => ex
      raise ConfigurationError,
        "Failed to load filter configuration file: #{file} - #{ex.message}"
    end

    def initialize(*accept_regexes)
      @accept_regexes = Set.new(accept_regexes)
    end

    # True if the line matches at least one accept regex.
    def include?(line)
      to_proc.call(line)
    end

    # A predicate proc suitable for passing to Enumerable methods.
    def to_proc
      @proc ||= Proc.new { |line| !!(accept_regexes.find { |regex| line =~ regex }) }
    end

    def to_s
      "#<#{self.class.name}:#{accept_regexes.to_a.inspect}>"
    end

    private

    # Regexes in a deterministic order so #to_s is stable across runs.
    def accept_regexes
      @accept_regexes.to_a.sort_by(&:inspect)
    end
  end
end
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# Copyright Swipely, Inc. All rights reserved.
|
|
2
|
+
|
|
3
|
+
require 'delegate'
|
|
4
|
+
require 'digest/md5'
|
|
5
|
+
|
|
6
|
+
module Hasta
  # An S3File delegate that drops filtered lines.
  #
  # Every method not overridden here is forwarded to the wrapped s3_file;
  # only the line-producing methods and the fingerprint are filter-aware.
  class FilteredS3File < SimpleDelegator
    def initialize(s3_file, filter)
      super(s3_file)
      @filter = filter
    end

    # The file body with all rejected lines removed.
    def body
      String.new.tap do |content|
        each_line { |line| content << line }
      end
    end

    # Cache key that varies with both the underlying file and the filter,
    # so differently-filtered views of one file never collide.
    def fingerprint
      @fingerprint ||= Digest::MD5.hexdigest("#{__getobj__.fingerprint}_#{filter.to_s}")
    end

    # Yields only the lines accepted by the filter; returns an Enumerator
    # when no block is given.
    def each_line
      return enum_for(:each_line) unless block_given?

      __getobj__.each_line do |line|
        next unless filter.include?(line)
        yield line
      end
    end

    private

    attr_reader :filter
  end
end
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
# Copyright Swipely, Inc. All rights reserved.
|
|
2
|
+
|
|
3
|
+
require 'hasta/combined_data_source'
|
|
4
|
+
|
|
5
|
+
module Hasta
  # Pass-through mapper used by any EMR job that requires an identity
  # mapper ('cat' or the Hadoop IdentityMapper class).
  module IdentityMapper
    # Copies every line from the combined data sources into the data sink,
    # stripping trailing whitespace. Returns the closed data sink.
    def self.map(_, data_sources, data_sink = InMemoryDataSink.new)
      Hasta.logger.debug "Starting Identity Mapper"

      source = CombinedDataSource.new(data_sources)
      source.each_line { |record| data_sink << record.rstrip }

      data_sink.close.tap { Hasta.logger.debug "Finished Identity Mapper" }
    end
  end
end
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# Copyright Swipely, Inc. All rights reserved.
|
|
2
|
+
|
|
3
|
+
require 'hasta/in_memory_data_sink'
|
|
4
|
+
require 'hasta/sorted_data_source'
|
|
5
|
+
|
|
6
|
+
module Hasta
  # Pass-through reducer used by any EMR job that requires an identity
  # reducer ('cat' or the Hadoop IdentityReducer class).
  module IdentityReducer
    # Copies every line of the sorted data source into the data sink,
    # stripping trailing whitespace. Returns the closed data sink.
    def self.reduce(_, data_source, data_sink = InMemoryDataSink.new)
      Hasta.logger.debug "Starting Identity Reducer"

      sorted = SortedDataSource.new(data_source)
      sorted.each_line { |record| data_sink << record.rstrip }

      data_sink.close.tap { Hasta.logger.debug "Finished Identity Reducer" }
    end
  end
end
|