hasta 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.cane +1 -0
- data/.gitignore +3 -0
- data/.rspec +2 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +20 -0
- data/README.md +150 -0
- data/Rakefile +15 -0
- data/hasta.gemspec +29 -0
- data/lib/hasta.rb +46 -0
- data/lib/hasta/cached_s3_file.rb +21 -0
- data/lib/hasta/combined_data_source.rb +35 -0
- data/lib/hasta/combined_storage.rb +30 -0
- data/lib/hasta/configuration.rb +88 -0
- data/lib/hasta/emr_job_definition.rb +104 -0
- data/lib/hasta/emr_node.rb +103 -0
- data/lib/hasta/env.rb +35 -0
- data/lib/hasta/execution_context.rb +90 -0
- data/lib/hasta/filter.rb +40 -0
- data/lib/hasta/filtered_s3_file.rb +34 -0
- data/lib/hasta/identity_mapper.rb +17 -0
- data/lib/hasta/identity_reducer.rb +18 -0
- data/lib/hasta/in_memory_data_sink.rb +40 -0
- data/lib/hasta/in_memory_data_source.rb +35 -0
- data/lib/hasta/interpolate_string.rb +45 -0
- data/lib/hasta/local_file_path.rb +12 -0
- data/lib/hasta/local_storage.rb +41 -0
- data/lib/hasta/mapper.rb +23 -0
- data/lib/hasta/reducer.rb +29 -0
- data/lib/hasta/resolve_cached_s3_file.rb +29 -0
- data/lib/hasta/resolve_filtered_s3_file.rb +22 -0
- data/lib/hasta/runner.rb +32 -0
- data/lib/hasta/s3_data_sink.rb +48 -0
- data/lib/hasta/s3_data_source.rb +41 -0
- data/lib/hasta/s3_file.rb +56 -0
- data/lib/hasta/s3_file_cache.rb +23 -0
- data/lib/hasta/s3_storage.rb +21 -0
- data/lib/hasta/s3_uri.rb +60 -0
- data/lib/hasta/sorted_data_source.rb +36 -0
- data/lib/hasta/storage.rb +82 -0
- data/lib/hasta/tasks.rb +8 -0
- data/lib/hasta/tasks/runner.rb +84 -0
- data/lib/hasta/version.rb +3 -0
- data/spec/fixtures/hasta/filter_config.txt +1 -0
- data/spec/fixtures/hasta/json/emr_node.json +10 -0
- data/spec/fixtures/hasta/json/pipeline_definition.json +135 -0
- data/spec/fixtures/hasta/lib/failing_mapper.rb +19 -0
- data/spec/fixtures/hasta/lib/test_env_mapper.rb +20 -0
- data/spec/fixtures/hasta/lib/test_identity_mapper.rb +20 -0
- data/spec/fixtures/hasta/lib/test_types_mapper.rb +21 -0
- data/spec/fixtures/hasta/lib/types.rb +1 -0
- data/spec/fixtures/hasta/lib/unconventional_reducer.rb +17 -0
- data/spec/hasta/combined_data_source_spec.rb +25 -0
- data/spec/hasta/combined_storage_spec.rb +54 -0
- data/spec/hasta/configuration_spec.rb +49 -0
- data/spec/hasta/emr_job_definition_spec.rb +181 -0
- data/spec/hasta/emr_node_spec.rb +32 -0
- data/spec/hasta/env_spec.rb +30 -0
- data/spec/hasta/execution_context_spec.rb +67 -0
- data/spec/hasta/filter_spec.rb +66 -0
- data/spec/hasta/filtered_s3_file_spec.rb +45 -0
- data/spec/hasta/identity_mapper_spec.rb +22 -0
- data/spec/hasta/identity_reducer_spec.rb +20 -0
- data/spec/hasta/interpolate_string_spec.rb +44 -0
- data/spec/hasta/local_file_path_spec.rb +18 -0
- data/spec/hasta/local_storage_spec.rb +52 -0
- data/spec/hasta/mapper_spec.rb +26 -0
- data/spec/hasta/reducer_spec.rb +26 -0
- data/spec/hasta/resolved_cached_s3_file_spec.rb +68 -0
- data/spec/hasta/s3_data_source_spec.rb +39 -0
- data/spec/hasta/s3_file_cache_spec.rb +45 -0
- data/spec/hasta/s3_file_spec.rb +122 -0
- data/spec/hasta/s3_storage_spec.rb +24 -0
- data/spec/hasta/s3_uri_spec.rb +151 -0
- data/spec/hasta/sorted_data_source_spec.rb +22 -0
- data/spec/spec_helper.rb +24 -0
- data/spec/support/shared_contexts/hasta/local_fog_storage.rb +17 -0
- data/spec/support/shared_examples/hasta/storage_examples.rb +103 -0
- metadata +254 -0
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
# Copyright Swipely, Inc. All rights reserved.
|
|
2
|
+
|
|
3
|
+
module Hasta
  # Flat-namespace data cache backed by a single Fog storage directory
  # (bucket). The bucket is looked up by name and created on first use.
  class S3FileCache
    # fog_storage must respond to #directories (a Fog directory collection).
    def initialize(fog_storage, bucket_name = 'cache')
      dirs = fog_storage.directories
      existing = dirs.get(bucket_name)
      @bucket = existing || dirs.create(:key => bucket_name)
    end

    # Returns the cached Fog file stored under +key+, or nil when absent.
    def get(key)
      files.get(key)
    end

    # Stores +data+ under +key+, overwriting any previous entry.
    def put(key, data)
      files.create(:key => key, :body => data)
    end

    private

    attr_reader :bucket

    # The Fog file collection of the backing bucket.
    def files
      bucket.files
    end
  end
end
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# Copyright Swipely, Inc. All rights reserved.
|
|
2
|
+
|
|
3
|
+
require 'fog'
|
|
4
|
+
|
|
5
|
+
require 'hasta/s3_uri'
|
|
6
|
+
require 'hasta/storage'
|
|
7
|
+
|
|
8
|
+
module Hasta
  # The read-only file storage interface to the actual S3 data used by the local map/reduce jobs
  class S3Storage
    include Storage

    private

    # Lists the Fog files that live directly inside the directory named by
    # +s3_uri+: prefix-matched files, minus the directory marker itself and
    # anything nested more than one level deep.
    def fog_files(s3_bucket, s3_uri)
      candidates = s3_bucket.files.all('prefix' => s3_uri.path)
      candidates.select do |file|
        file.key != s3_uri.path && file_s3_uri(file).parent == s3_uri
      end
    end
  end
end
|
data/lib/hasta/s3_uri.rb
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# Copyright Swipely, Inc. All rights reserved.
|
|
2
|
+
|
|
3
|
+
module Hasta
  # Value object representing a URI to a file or directory on S3 ('s3://' or
  # 's3n://' scheme). A trailing '/' on the path marks a directory; the path
  # is nil for a bucket-only URI.
  class S3URI
    attr_reader :bucket, :path

    # Builds an S3URI from a string such as "s3://bucket/key" or
    # "s3n://bucket/dir/". The stored path has its leading slash removed and
    # is nil when the URI names only a bucket.
    # Raises ArgumentError for anything that is not a recognizable S3 URI.
    def self.parse(uri)
      match = /\As3n?:\/\/([^\/]+?)(\/.*)?\z/.match(uri)
      raise ArgumentError, "Invalid S3 URI: #{uri}" unless match

      raw_path = match[2]
      new(match[1], raw_path && raw_path[1..-1])
    end

    def initialize(bucket, path)
      @bucket = bucket
      @path = path
    end

    # True when the path ends in '/'.
    # NOTE(review): raises NoMethodError for a bucket-only URI (nil path) —
    # confirm callers never ask a bucket-only URI whether it is a directory.
    def directory?
      path.end_with?('/')
    end

    # True when the URI does not denote a directory.
    def file?
      !directory?
    end

    # Final path segment, or '' for a bucket-only URI.
    def basename
      path ? path.split('/').last : ''
    end

    # The containing directory as a new S3URI (path always ends in '/'),
    # or nil for a bucket-only URI.
    def parent
      return nil unless path

      segments = path.split('/')
      self.class.new(bucket, segments[0...-1].join('/') + '/')
    end

    # A new S3URI with +append_path+ joined under this directory URI.
    # Raises ArgumentError when called on a file URI.
    def append(append_path)
      raise ArgumentError, "Cannot append to a file path: #{self}" if file?
      self.class.new(bucket, File.join(path, append_path))
    end

    # Structural equality on bucket and path.
    def ==(other)
      self.class === other && self.bucket == other.bucket && self.path == other.path
    end

    # Canonicalizes to the 's3://' scheme; omits the path when nil.
    def to_s
      ["s3:/", bucket, path].compact.join('/')
    end
  end
end
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# Copyright Swipely, Inc. All rights reserved.
|
|
2
|
+
|
|
3
|
+
module Hasta
  # Decorator for a data source that yields the contents in sorted order.
  # The wrapped data_source must respond to #name, #to_a, and #to_s.
  class SortedDataSource
    def initialize(data_source)
      @data_source = data_source
    end

    # Delegates to the wrapped source's name.
    def name
      data_source.name
    end

    # Yields each line of the wrapped source in sorted order.
    # Returns an Enumerator when no block is given.
    def each_line
      return enum_for(:each_line) unless block_given?

      sorted_lines.each do |line|
        yield line
      end
    end

    # BUG FIX: this previously interpolated `lines.count`, but no `lines`
    # method exists on this class (only the private `sorted_lines`), so every
    # call to #to_s raised NoMethodError. Count the underlying data directly —
    # without sorting, which would be wasted work (and a debug log) just to
    # render a description.
    def to_s
      "#<#{self.class.name}:#{name} size=#{data_source.to_a.count} lines>"
    end

    private

    attr_reader :data_source

    # Materializes and sorts the wrapped source's lines, logging on completion.
    def sorted_lines
      data_source.to_a.sort.tap do
        Hasta.logger.debug "Finished sorting data for source: #{data_source}"
      end
    end
  end
end
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# Copyright Swipely, Inc. All rights reserved.
|
|
2
|
+
|
|
3
|
+
require 'hasta/s3_file'
|
|
4
|
+
|
|
5
|
+
module Hasta
  # Common file storage methods used by the local and S3 storage providers.
  # Including classes are expected to provide #fog_files(bucket, s3_uri)
  # listing the Fog files inside a directory URI.
  module Storage
    # Default resolver: creates the appropriate Hasta S3 file instance given
    # a Fog file.
    module ResolveS3File
      def self.resolve(fog_file)
        S3File.wrap(fog_file)
      end
    end

    def initialize(fog_storage, s3_file_resolver = ResolveS3File)
      @fog_storage = fog_storage
      @s3_file_resolver = s3_file_resolver
    end

    # True when the URI names an existing file, or a directory with at least
    # one file in it. Returns nil (falsy) when the bucket itself is missing.
    def exists?(s3_uri)
      if s3_uri.file?
        !!fog_file(s3_uri)
      else
        s3_bucket = bucket(s3_uri)
        !fog_files(s3_bucket, s3_uri).empty? if s3_bucket
      end
    end

    # Resolved S3 file(s) for the URI: a one-element array for a file URI,
    # otherwise every file directly under the directory URI.
    # Raises NonExistentPath when the file or bucket does not exist.
    def files_for(s3_uri)
      return [s3_file!(s3_uri)] if s3_uri.file?

      s3_files(bucket!(s3_uri), s3_uri)
    end

    private

    attr_reader :fog_storage, :s3_file_resolver

    # The Fog directory for the URI's bucket, or nil when absent.
    def bucket(s3_uri)
      fog_storage.directories.get(s3_uri.bucket)
    end

    def s3_file!(s3_uri)
      s3_file_resolver.resolve(fog_file!(s3_uri))
    end

    def s3_files(bucket, s3_uri)
      fog_files(bucket, s3_uri).map do |fog_file|
        s3_file_resolver.resolve(fog_file)
      end
    end

    # The Fog file at the URI's path, or nil when the bucket or key is absent.
    def fog_file(s3_uri)
      s3_bucket = bucket(s3_uri)
      s3_bucket && s3_bucket.files.get(s3_uri.path)
    end

    def bucket!(s3_uri)
      bang!(s3_uri) { bucket(s3_uri) }
    end

    def fog_file!(s3_uri)
      bang!(s3_uri) { fog_file(s3_uri) }
    end

    def create_bucket(bucket_name)
      fog_storage.directories.create(:key => bucket_name)
    end

    def create_file(bucket, key, content)
      bucket.files.create(:body => content, :key => key)
    end

    def file_s3_uri(file, path = file.key)
      S3URI.new(file.directory.key, path)
    end

    # Evaluates the block and returns its result; raises NonExistentPath for
    # +s3_uri+ when the result is falsy.
    def bang!(s3_uri)
      result = yield
      raise NonExistentPath.new(s3_uri) unless result
      result
    end
  end
end
|
data/lib/hasta/tasks.rb
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
# Copyright Swipely, Inc. All rights reserved.
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
require 'rake'
|
|
5
|
+
require 'rake/tasklib'
|
|
6
|
+
require 'hasta'
|
|
7
|
+
|
|
8
|
+
require 'hasta/emr_job_definition'
|
|
9
|
+
require 'hasta/runner'
|
|
10
|
+
|
|
11
|
+
module Hasta
  module Tasks
    # Rake task that runs a local test of an EMR job.
    #
    # Constructing an instance registers the rake task; attributes may be
    # configured inside the constructor block, and :job_id /
    # :scheduled_start_time may also be supplied as rake task arguments.
    class Runner < ::Rake::TaskLib
      include ::Rake::DSL if defined?(::Rake::DSL)

      # Name of task.
      #
      # default:
      #   :runner
      attr_accessor :name

      # Path to the AWS Data Pipeline definition file
      attr_accessor :definition_file

      # The Scheduled Start Time to use when evaluating the definition
      #
      # default:
      #   Time.now
      attr_accessor :scheduled_start_time

      # The id of the EMR job to perform
      attr_accessor :job_id

      # The root directory of the project containing the EMR code that is being tested
      attr_accessor :project_root

      # Use verbose output. If this is set to true, the task will print the
      # local and remote paths of each step file it uploads to S3.
      #
      # default:
      #   true
      attr_accessor :verbose

      # Initializes defaults and registers the rake task. The optional block
      # is invoked when the task RUNS (not at definition time), receiving this
      # Runner and the task arguments — trimmed to the block's arity — so it
      # can finish configuring the run.
      def initialize(*args, &task_block)
        setup_ivars(args)

        desc "Runs the specified EMR job"
        task name, [:job_id, :scheduled_start_time] do |_, task_args|
          # Wrap execution so RakeFileUtils honors the configured verbosity.
          RakeFileUtils.send(:verbose, verbose) do
            if task_block
              # Pass only as many of [self, task_args] as the block accepts.
              task_block.call(*[self, task_args].slice(0, task_block.arity))
            end

            run_task verbose
          end
        end
      end

      # Sets default attribute values; the first positional arg (if any) is
      # the task name.
      def setup_ivars(args)
        @name = args.shift || :runner
        @verbose = true
        # NOTE(review): @path is never read anywhere in this class — confirm
        # it is not vestigial before relying on it.
        @path = "definitions"
        @scheduled_start_time = Time.now
      end

      # Loads the EMR job definition and executes the map/reduce job locally.
      # NOTE(review): the +verbose+ parameter is unused here — verbosity is
      # already applied by the RakeFileUtils wrapper in #initialize.
      def run_task(verbose)
        Hasta.configure do |config|
          config.project_root = project_root
        end

        definition = Hasta::EmrJobDefinition.load(definition_file, job_id, scheduled_start_time)
        runner = Hasta::Runner.new(definition.id, definition.mapper, definition.reducer)

        # The run result is also this method's return value.
        result = runner.run(
          definition.data_sources,
          definition.data_sink,
          definition.ruby_files,
          definition.env
        )
      end
    end
  end
end
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
\AOnly Allow This Line\z
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
{
|
|
2
|
+
"id": "EMRJob1",
|
|
3
|
+
"type": "EmrActivity",
|
|
4
|
+
"onFail": { "ref": "FailureNotify" },
|
|
5
|
+
"schedule": { "ref": "Nightly" },
|
|
6
|
+
"runsOn": { "ref": "MenuIntelEMRCluster" },
|
|
7
|
+
"step": "/home/hadoop/contrib/streaming/hadoop-streaming.jar,-input,s3n://data-bucket/path/to/data/#{format(@scheduledStartTime,'YYYY-MM-dd_HHmmss')}/input1/,-output,s3://data-bucket/path/to/data/#{format(@scheduledStartTime,'YYYY-MM-dd_HHmmss')}/output/,-mapper,cat,-reducer,s3n://steps-bucket/path/to/reducer.rb,-cacheFile,s3://data-bucket/path/to/mappings.yml#mappings.yml,-cacheFile,s3://data-bucket/path/to/ignored.yml#ignored.yml,-cmdenv,API_KEY=123456,-cmdenv,ENVIRONMENT_NAME=uat",
|
|
8
|
+
"input": [ { "ref": "S3Input1" }, { "ref": "S3Input2" }, { "ref": "S3Input3" } ],
|
|
9
|
+
"output": { "ref": "S3Output" }
|
|
10
|
+
}
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
{
|
|
2
|
+
"objects": [
|
|
3
|
+
{
|
|
4
|
+
"id": "Default",
|
|
5
|
+
"role": "role-aggregator",
|
|
6
|
+
"resourceRole": "role-aggregator",
|
|
7
|
+
"failureAndRerunMode": "cascade",
|
|
8
|
+
"scheduleType": "cron",
|
|
9
|
+
"onFail": { "ref": "FailureNotify" }
|
|
10
|
+
},
|
|
11
|
+
|
|
12
|
+
{
|
|
13
|
+
"id": "Nightly",
|
|
14
|
+
"type": "Schedule",
|
|
15
|
+
"startDateTime": "2014-03-26T18:00:00",
|
|
16
|
+
"period": "12 hours"
|
|
17
|
+
},
|
|
18
|
+
|
|
19
|
+
{
|
|
20
|
+
"id": "SuccessNotify",
|
|
21
|
+
"type": "SnsAlarm",
|
|
22
|
+
"topicArn": "arn:aws:sns:us-east-1:999999999999:datapipeline-name",
|
|
23
|
+
"subject": "SUCCESS: pipeline step #{node.name}",
|
|
24
|
+
"message": "pipeline step SUCCESS\n\nScheduled start: #{node.@scheduledStartTime}\nActual start: #{node.@actualStartTime}\nActual end:\n#{node.@actualEndTime}"
|
|
25
|
+
},
|
|
26
|
+
|
|
27
|
+
{
|
|
28
|
+
"id": "FailureNotify",
|
|
29
|
+
"type": "SnsAlarm",
|
|
30
|
+
"topicArn": "arn:aws:sns:us-east-1:999999999999:datapipeline-name",
|
|
31
|
+
"subject": "FAILURE: pipeline step #{node.name}",
|
|
32
|
+
"message": "pipeline step FAILED #{node.name}\n\nScheduled start: #{node.@scheduledStartTime}\nError message:\n#{node.errorMessage}\nError stack trace:\n#{node.errorStackTrace}"
|
|
33
|
+
},
|
|
34
|
+
|
|
35
|
+
{
|
|
36
|
+
"id": "S3FirstInput",
|
|
37
|
+
"type": "S3DataNode",
|
|
38
|
+
"schedule": { "ref": "Nightly" },
|
|
39
|
+
"directoryPath": "s3://data-bucket/path/to/data/dir1/"
|
|
40
|
+
},
|
|
41
|
+
|
|
42
|
+
{
|
|
43
|
+
"id": "S3SecondInput",
|
|
44
|
+
"type": "S3DataNode",
|
|
45
|
+
"schedule": { "ref": "Nightly" },
|
|
46
|
+
"directoryPath": "s3://data-bucket/path/to/data/dir2/"
|
|
47
|
+
},
|
|
48
|
+
|
|
49
|
+
{
|
|
50
|
+
"id": "S3ThirdInput",
|
|
51
|
+
"type": "S3DataNode",
|
|
52
|
+
"schedule": { "ref": "Nightly" },
|
|
53
|
+
"filePath": "s3://datapipeline-assets/path/to/data/file.csv"
|
|
54
|
+
},
|
|
55
|
+
|
|
56
|
+
{
|
|
57
|
+
"id": "S3Output",
|
|
58
|
+
"type": "S3DataNode",
|
|
59
|
+
"schedule": { "ref": "Nightly" },
|
|
60
|
+
"directoryPath": "s3://data-bucket/path/to_data/results/"
|
|
61
|
+
},
|
|
62
|
+
|
|
63
|
+
{
|
|
64
|
+
"id": "TestPipelineEC2Resource",
|
|
65
|
+
"type": "Ec2Resource",
|
|
66
|
+
"instanceType": "m1.large",
|
|
67
|
+
"schedule": { "ref": "Nightly" },
|
|
68
|
+
"logUri": "s3://logs-bucket/path/to/test_pipeline/logs/TestPipelineEC2Resource",
|
|
69
|
+
"terminateAfter": "6 hours",
|
|
70
|
+
"keyPair": "pipeline-debug",
|
|
71
|
+
"securityGroups": ["s-pipe-appdb"]
|
|
72
|
+
},
|
|
73
|
+
|
|
74
|
+
{
|
|
75
|
+
"id": "TestPipelineEMRCluster",
|
|
76
|
+
"type": "EmrCluster",
|
|
77
|
+
"masterInstanceType": "m1.large",
|
|
78
|
+
"taskInstanceType": "m1.large",
|
|
79
|
+
"coreInstanceType": "m1.large",
|
|
80
|
+
"coreInstanceCount": "2",
|
|
81
|
+
"terminateAfter": "1 hour",
|
|
82
|
+
"schedule": { "ref": "Nightly" },
|
|
83
|
+
"enableDebugging": "true",
|
|
84
|
+
"bootstrapAction": "s3://steps-bucket/test_pipeline/steps/bootstrap_emr.sh",
|
|
85
|
+
"emrLogUri": "s3://logs-bucket/path/to/test_pipeline/logs/RecentSalesPipelineEMRLogs",
|
|
86
|
+
"logUri": "s3://logs-bucket/path/to/test_pipeline/logs/TestPipelineEMRCluster"
|
|
87
|
+
},
|
|
88
|
+
|
|
89
|
+
{
|
|
90
|
+
"id": "BootstrapEnvironment",
|
|
91
|
+
"type": "ShellCommandActivity",
|
|
92
|
+
"onFail": { "ref": "FailureNotify" },
|
|
93
|
+
"stdout": "s3://logs-bucket/path/to/test_pipeline/logs/BootstrapEnvironment/stdout",
|
|
94
|
+
"stderr": "s3://logs-bucket/path/to/test_pipeline/logs/BootstrapEnvironment/stderr",
|
|
95
|
+
"schedule": { "ref": "Nightly" },
|
|
96
|
+
"runsOn": { "ref": "TestPipelineEC2Resource" },
|
|
97
|
+
"scriptUri": "s3://steps-bucket/test_pipeline/steps/bootstrap_ec2.sh"
|
|
98
|
+
},
|
|
99
|
+
|
|
100
|
+
{
|
|
101
|
+
"id": "TouchEMRCluster",
|
|
102
|
+
"type": "ShellCommandActivity",
|
|
103
|
+
"onFail": { "ref": "FailureNotify" },
|
|
104
|
+
"schedule": { "ref": "Nightly" },
|
|
105
|
+
"runsOn": { "ref": "TestPipelineEMRCluster" },
|
|
106
|
+
"command": "true"
|
|
107
|
+
},
|
|
108
|
+
|
|
109
|
+
{
|
|
110
|
+
"id": "EMRJob1",
|
|
111
|
+
"type": "EmrActivity",
|
|
112
|
+
"onFail": { "ref": "FailureNotify" },
|
|
113
|
+
"schedule": { "ref": "Nightly" },
|
|
114
|
+
"runsOn": { "ref": "TestPipelineEMRCluster" },
|
|
115
|
+
"step": "/home/hadoop/contrib/streaming/hadoop-streaming.jar,-input,s3://data-bucket/path/to/data/dir1/,-input,s3://data-bucket/path/to/data/dir2/,-input,s3://data-bucket/path/to/data/file.csv,-output,s3://data-bucket/path/to_data/results/,-mapper,s3n://steps-bucket/test_pipeline/steps/mapper.rb,-reducer,s3n://steps-bucket/test_pipeline/steps/reducer.rb,-cacheFile,s3n://steps-bucket/test_pipeline/steps/types.rb#types.rb,-cacheFile,s3://data-bucket/path/to/data/notes.yml#notes.yml,-cmdenv,API_KEY=123456",
|
|
116
|
+
"input": [ { "ref": "S3FirstInput" }, { "ref": "S3SecondInput" }, { "ref": "S3ThirdInput" } ],
|
|
117
|
+
"output": { "ref": "S3Output" }
|
|
118
|
+
},
|
|
119
|
+
|
|
120
|
+
{
|
|
121
|
+
"id": "Complete",
|
|
122
|
+
"type": "ShellCommandActivity",
|
|
123
|
+
"onFail": { "ref": "FailureNotify" },
|
|
124
|
+
"schedule": { "ref": "Nightly" },
|
|
125
|
+
"onSuccess": { "ref": "SuccessNotify" },
|
|
126
|
+
"runsOn": { "ref": "TestPipelineEC2Resource" },
|
|
127
|
+
"dependsOn": [
|
|
128
|
+
{ "ref": "TouchEMRCluster" },
|
|
129
|
+
{ "ref": "EMRJob1" }
|
|
130
|
+
],
|
|
131
|
+
"command": "true"
|
|
132
|
+
}
|
|
133
|
+
|
|
134
|
+
]
|
|
135
|
+
}
|