hasta 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. checksums.yaml +7 -0
  2. data/.cane +1 -0
  3. data/.gitignore +3 -0
  4. data/.rspec +2 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +150 -0
  8. data/Rakefile +15 -0
  9. data/hasta.gemspec +29 -0
  10. data/lib/hasta.rb +46 -0
  11. data/lib/hasta/cached_s3_file.rb +21 -0
  12. data/lib/hasta/combined_data_source.rb +35 -0
  13. data/lib/hasta/combined_storage.rb +30 -0
  14. data/lib/hasta/configuration.rb +88 -0
  15. data/lib/hasta/emr_job_definition.rb +104 -0
  16. data/lib/hasta/emr_node.rb +103 -0
  17. data/lib/hasta/env.rb +35 -0
  18. data/lib/hasta/execution_context.rb +90 -0
  19. data/lib/hasta/filter.rb +40 -0
  20. data/lib/hasta/filtered_s3_file.rb +34 -0
  21. data/lib/hasta/identity_mapper.rb +17 -0
  22. data/lib/hasta/identity_reducer.rb +18 -0
  23. data/lib/hasta/in_memory_data_sink.rb +40 -0
  24. data/lib/hasta/in_memory_data_source.rb +35 -0
  25. data/lib/hasta/interpolate_string.rb +45 -0
  26. data/lib/hasta/local_file_path.rb +12 -0
  27. data/lib/hasta/local_storage.rb +41 -0
  28. data/lib/hasta/mapper.rb +23 -0
  29. data/lib/hasta/reducer.rb +29 -0
  30. data/lib/hasta/resolve_cached_s3_file.rb +29 -0
  31. data/lib/hasta/resolve_filtered_s3_file.rb +22 -0
  32. data/lib/hasta/runner.rb +32 -0
  33. data/lib/hasta/s3_data_sink.rb +48 -0
  34. data/lib/hasta/s3_data_source.rb +41 -0
  35. data/lib/hasta/s3_file.rb +56 -0
  36. data/lib/hasta/s3_file_cache.rb +23 -0
  37. data/lib/hasta/s3_storage.rb +21 -0
  38. data/lib/hasta/s3_uri.rb +60 -0
  39. data/lib/hasta/sorted_data_source.rb +36 -0
  40. data/lib/hasta/storage.rb +82 -0
  41. data/lib/hasta/tasks.rb +8 -0
  42. data/lib/hasta/tasks/runner.rb +84 -0
  43. data/lib/hasta/version.rb +3 -0
  44. data/spec/fixtures/hasta/filter_config.txt +1 -0
  45. data/spec/fixtures/hasta/json/emr_node.json +10 -0
  46. data/spec/fixtures/hasta/json/pipeline_definition.json +135 -0
  47. data/spec/fixtures/hasta/lib/failing_mapper.rb +19 -0
  48. data/spec/fixtures/hasta/lib/test_env_mapper.rb +20 -0
  49. data/spec/fixtures/hasta/lib/test_identity_mapper.rb +20 -0
  50. data/spec/fixtures/hasta/lib/test_types_mapper.rb +21 -0
  51. data/spec/fixtures/hasta/lib/types.rb +1 -0
  52. data/spec/fixtures/hasta/lib/unconventional_reducer.rb +17 -0
  53. data/spec/hasta/combined_data_source_spec.rb +25 -0
  54. data/spec/hasta/combined_storage_spec.rb +54 -0
  55. data/spec/hasta/configuration_spec.rb +49 -0
  56. data/spec/hasta/emr_job_definition_spec.rb +181 -0
  57. data/spec/hasta/emr_node_spec.rb +32 -0
  58. data/spec/hasta/env_spec.rb +30 -0
  59. data/spec/hasta/execution_context_spec.rb +67 -0
  60. data/spec/hasta/filter_spec.rb +66 -0
  61. data/spec/hasta/filtered_s3_file_spec.rb +45 -0
  62. data/spec/hasta/identity_mapper_spec.rb +22 -0
  63. data/spec/hasta/identity_reducer_spec.rb +20 -0
  64. data/spec/hasta/interpolate_string_spec.rb +44 -0
  65. data/spec/hasta/local_file_path_spec.rb +18 -0
  66. data/spec/hasta/local_storage_spec.rb +52 -0
  67. data/spec/hasta/mapper_spec.rb +26 -0
  68. data/spec/hasta/reducer_spec.rb +26 -0
  69. data/spec/hasta/resolved_cached_s3_file_spec.rb +68 -0
  70. data/spec/hasta/s3_data_source_spec.rb +39 -0
  71. data/spec/hasta/s3_file_cache_spec.rb +45 -0
  72. data/spec/hasta/s3_file_spec.rb +122 -0
  73. data/spec/hasta/s3_storage_spec.rb +24 -0
  74. data/spec/hasta/s3_uri_spec.rb +151 -0
  75. data/spec/hasta/sorted_data_source_spec.rb +22 -0
  76. data/spec/spec_helper.rb +24 -0
  77. data/spec/support/shared_contexts/hasta/local_fog_storage.rb +17 -0
  78. data/spec/support/shared_examples/hasta/storage_examples.rb +103 -0
  79. metadata +254 -0
@@ -0,0 +1,23 @@
1
+ # Copyright Swipely, Inc. All rights reserved.
2
+
3
module Hasta
  # Flat-namespace data cache backed by a Fog storage bucket.
  class S3FileCache
    # fog_storage - a Fog storage connection
    # bucket_name - the cache bucket; created on first use when absent
    def initialize(fog_storage, bucket_name = 'cache')
      dirs = fog_storage.directories
      existing = dirs.get(bucket_name)
      @bucket = existing.nil? ? dirs.create(:key => bucket_name) : existing
    end

    # Returns the cached file stored under +key+, or nil when absent.
    def get(key)
      bucket.files.get(key)
    end

    # Stores +data+ under +key+ and returns the created file.
    def put(key, data)
      bucket.files.create(:key => key, :body => data)
    end

    private

    attr_reader :bucket
  end
end
@@ -0,0 +1,21 @@
1
+ # Copyright Swipely, Inc. All rights reserved.
2
+
3
+ require 'fog'
4
+
5
+ require 'hasta/s3_uri'
6
+ require 'hasta/storage'
7
+
8
module Hasta
  # Read-only Storage implementation over the actual S3 data used by the
  # local map/reduce jobs.
  class S3Storage
    include Storage

    private

    # Lists the immediate child files of the +s3_uri+ directory within
    # +s3_bucket+: skips the directory marker object itself and anything
    # nested more than one level deep.
    def fog_files(s3_bucket, s3_uri)
      candidates = s3_bucket.files.all('prefix' => s3_uri.path)
      candidates.select do |file|
        file.key != s3_uri.path && file_s3_uri(file).parent == s3_uri
      end
    end
  end
end
@@ -0,0 +1,60 @@
1
+ # Copyright Swipely, Inc. All rights reserved.
2
+
3
module Hasta
  # Represents a URI to a file or directory on S3.
  #
  # Accepts both the "s3://" and "s3n://" schemes. A nil path denotes the
  # bucket root (e.g. "s3://bucket"), which is treated as a directory.
  class S3URI
    attr_reader :bucket, :path

    # Parses an S3 URI string into an S3URI.
    # Raises ArgumentError when the string is not an s3:// or s3n:// URI.
    def self.parse(uri)
      if match = /\As3n?:\/\/([^\/]+?)(\/.*)?\z/.match(uri)
        # Drop the leading "/" so the path is relative to the bucket
        # (nil when the URI names the bucket root).
        canonical_path = match[2] && match[2][1..-1]
        new(match[1], canonical_path)
      else
        raise ArgumentError, "Invalid S3 URI: #{uri}"
      end
    end

    def initialize(bucket, path)
      @bucket = bucket
      @path = path
    end

    # True for the bucket root or any path ending in "/".
    # FIX: previously raised NoMethodError when path was nil (bucket-root URIs).
    def directory?
      path.nil? || path.end_with?('/')
    end

    def file?
      !directory?
    end

    # The last path component, or "" for the bucket root.
    def basename
      if path
        path.split('/').last
      else
        ''
      end
    end

    # The containing directory as an S3URI, or nil for the bucket root.
    def parent
      if path.nil?
        nil
      else
        elements = path.split('/')
        self.class.new(bucket, "#{elements.take(elements.length - 1).join('/')}/")
      end
    end

    # Returns a new S3URI with +append_path+ joined onto this directory path.
    # Raises ArgumentError when called on a file URI.
    # FIX: now supports appending to a bucket-root URI (nil path).
    def append(append_path)
      raise ArgumentError, "Cannot append to a file path: #{self}" if file?
      self.class.new(bucket, path ? File.join(path, append_path) : append_path)
    end

    def ==(other)
      self.class === other && (self.bucket == other.bucket && self.path == other.path)
    end

    # Defined alongside #== so S3URI instances behave correctly as Hash keys.
    alias_method :eql?, :==

    def hash
      [bucket, path].hash
    end

    def to_s
      ["s3:/", bucket, path].compact.join('/')
    end
  end
end
@@ -0,0 +1,36 @@
1
+ # Copyright Swipely, Inc. All rights reserved.
2
+
3
module Hasta
  # Decorator for a data source that yields its contents in sorted order.
  class SortedDataSource
    def initialize(data_source)
      @data_source = data_source
    end

    # Delegates to the decorated source's name.
    def name
      data_source.name
    end

    # Yields each line of the source in sorted order.
    # Returns an Enumerator when no block is given.
    def each_line
      return enum_for(:each_line) unless block_given?

      sorted_lines.each do |line|
        yield line
      end
    end

    def to_s
      # BUG FIX: previously referenced undefined method `lines`, which raised
      # NoMethodError whenever to_s was called.
      "#<#{self.class.name}:#{name} size=#{sorted_lines.count} lines>"
    end

    private

    attr_reader :data_source

    # Materializes and sorts the source's lines (sorts on each call).
    def sorted_lines
      data_source.to_a.sort.tap do
        Hasta.logger.debug "Finished sorting data for source: #{data_source}"
      end
    end
  end
end
@@ -0,0 +1,82 @@
1
+ # Copyright Swipely, Inc. All rights reserved.
2
+
3
+ require 'hasta/s3_file'
4
+
5
module Hasta
  # Common file storage methods shared by the local and S3 storage providers.
  # Including classes must implement a private fog_files(bucket, s3_uri) that
  # lists the child files of a directory URI.
  module Storage
    # Creates the appropriate Hasta S3 file instance given a Fog file.
    module ResolveS3File
      def self.resolve(fog_file)
        S3File.wrap(fog_file)
      end
    end

    def initialize(fog_storage, s3_file_resolver = ResolveS3File)
      @fog_storage = fog_storage
      @s3_file_resolver = s3_file_resolver
    end

    # True when +s3_uri+ names an existing file, or a directory containing at
    # least one file. Returns nil when a directory URI's bucket is missing.
    def exists?(s3_uri)
      return !fog_file(s3_uri).nil? if s3_uri.file?

      s3_bucket = bucket(s3_uri)
      !fog_files(s3_bucket, s3_uri).empty? if s3_bucket
    end

    # Resolves +s3_uri+ to an array of S3 file objects.
    # Raises NonExistentPath when the file or bucket does not exist.
    def files_for(s3_uri)
      return [s3_file!(s3_uri)] if s3_uri.file?

      s3_files(bucket!(s3_uri), s3_uri)
    end

    private

    attr_reader :fog_storage, :s3_file_resolver

    # The Fog directory for the URI's bucket, or nil when missing.
    def bucket(s3_uri)
      fog_storage.directories.get(s3_uri.bucket)
    end

    def s3_file!(s3_uri)
      s3_file_resolver.resolve(fog_file!(s3_uri))
    end

    def s3_files(bucket, s3_uri)
      fog_files(bucket, s3_uri).map do |fog_file|
        s3_file_resolver.resolve(fog_file)
      end
    end

    # The Fog file at the URI, or nil when the bucket or file is missing.
    def fog_file(s3_uri)
      s3_bucket = bucket(s3_uri)
      s3_bucket && s3_bucket.files.get(s3_uri.path)
    end

    def bucket!(s3_uri)
      bang!(s3_uri) { bucket(s3_uri) }
    end

    def fog_file!(s3_uri)
      bang!(s3_uri) { fog_file(s3_uri) }
    end

    def create_bucket(bucket_name)
      fog_storage.directories.create(:key => bucket_name)
    end

    def create_file(bucket, key, content)
      bucket.files.create(:key => key, :body => content)
    end

    def file_s3_uri(file, path = file.key)
      S3URI.new(file.directory.key, path)
    end

    # Yields and returns the result; raises NonExistentPath when it is falsy.
    def bang!(s3_uri)
      result = yield
      raise NonExistentPath.new(s3_uri) unless result
      result
    end
  end
end
@@ -0,0 +1,8 @@
1
+ # Copyright Swipely, Inc. All rights reserved.
2
+
3
+ require 'hasta/tasks/runner'
4
+
5
module Hasta
  # Namespace for the Rake tasks shipped with the gem; the task classes
  # themselves live under hasta/tasks/ (required above).
  module Tasks
  end
end
@@ -0,0 +1,84 @@
1
+ # Copyright Swipely, Inc. All rights reserved.
2
+
3
+
4
+ require 'rake'
5
+ require 'rake/tasklib'
6
+ require 'hasta'
7
+
8
+ require 'hasta/emr_job_definition'
9
+ require 'hasta/runner'
10
+
11
module Hasta
  module Tasks
    # Rake task that runs a local test of an EMR job.
    class Runner < ::Rake::TaskLib
      include ::Rake::DSL if defined?(::Rake::DSL)

      # Name of task.
      #
      # default:
      #   :runner
      attr_accessor :name

      # Path to the AWS Data Pipeline definition file
      attr_accessor :definition_file

      # The Scheduled Start Time to use when evaluating the definition
      #
      # default:
      #   Time.now
      attr_accessor :scheduled_start_time

      # The id of the EMR job to perform
      attr_accessor :job_id

      # The root directory of the project containing the EMR code that is being tested
      attr_accessor :project_root

      # Use verbose output. If this is set to true, the task will print the
      # local and remote paths of each step file it uploads to S3.
      #
      # default:
      #   true
      attr_accessor :verbose

      # Defines the Rake task. The optional block is invoked with
      # (self, task_args) when the task runs, so callers can configure the
      # task lazily using the runtime task arguments.
      def initialize(*args, &task_block)
        setup_ivars(args)

        desc "Runs the specified EMR job"
        task name, [:job_id, :scheduled_start_time] do |_, task_args|
          RakeFileUtils.send(:verbose, verbose) do
            # Arity-aware call: the block may accept (task), (task, args), or nothing.
            if task_block
              task_block.call(*[self, task_args].slice(0, task_block.arity))
            end

            run_task verbose
          end
        end
      end

      # Initializes defaults; +args+ may supply the task name as its first element.
      def setup_ivars(args)
        @name = args.shift || :runner
        @verbose = true
        # NOTE(review): @path appears unused anywhere in this class — confirm
        # before removing.
        @path = "definitions"
        @scheduled_start_time = Time.now
      end

      # Loads the EMR job definition and runs the map/reduce job locally,
      # returning the runner's result.
      def run_task(verbose)
        Hasta.configure do |config|
          config.project_root = project_root
        end

        definition = Hasta::EmrJobDefinition.load(definition_file, job_id, scheduled_start_time)
        runner = Hasta::Runner.new(definition.id, definition.mapper, definition.reducer)

        # FIX: dropped the unused local `result` assignment; the run result is
        # simply the method's return value.
        runner.run(
          definition.data_sources,
          definition.data_sink,
          definition.ruby_files,
          definition.env
        )
      end
    end
  end
end
@@ -0,0 +1,3 @@
1
module Hasta
  # Gem version string; frozen so shared references cannot be mutated.
  VERSION = "0.1.0".freeze
end
@@ -0,0 +1 @@
1
+ \AOnly Allow This Line\z
@@ -0,0 +1,10 @@
1
+ {
2
+ "id": "EMRJob1",
3
+ "type": "EmrActivity",
4
+ "onFail": { "ref": "FailureNotify" },
5
+ "schedule": { "ref": "Nightly" },
6
+ "runsOn": { "ref": "MenuIntelEMRCluster" },
7
+ "step": "/home/hadoop/contrib/streaming/hadoop-streaming.jar,-input,s3n://data-bucket/path/to/data/#{format(@scheduledStartTime,'YYYY-MM-dd_HHmmss')}/input1/,-output,s3://data-bucket/path/to/data/#{format(@scheduledStartTime,'YYYY-MM-dd_HHmmss')}/output/,-mapper,cat,-reducer,s3n://steps-bucket/path/to/reducer.rb,-cacheFile,s3://data-bucket/path/to/mappings.yml#mappings.yml,-cacheFile,s3://data-bucket/path/to/ignored.yml#ignored.yml,-cmdenv,API_KEY=123456,-cmdenv,ENVIRONMENT_NAME=uat",
8
+ "input": [ { "ref": "S3Input1" }, { "ref": "S3Input2" }, { "ref": "S3Input3" } ],
9
+ "output": { "ref": "S3Output" }
10
+ }
@@ -0,0 +1,135 @@
1
+ {
2
+ "objects": [
3
+ {
4
+ "id": "Default",
5
+ "role": "role-aggregator",
6
+ "resourceRole": "role-aggregator",
7
+ "failureAndRerunMode": "cascade",
8
+ "scheduleType": "cron",
9
+ "onFail": { "ref": "FailureNotify" }
10
+ },
11
+
12
+ {
13
+ "id": "Nightly",
14
+ "type": "Schedule",
15
+ "startDateTime": "2014-03-26T18:00:00",
16
+ "period": "12 hours"
17
+ },
18
+
19
+ {
20
+ "id": "SuccessNotify",
21
+ "type": "SnsAlarm",
22
+ "topicArn": "arn:aws:sns:us-east-1:999999999999:datapipeline-name",
23
+ "subject": "SUCCESS: pipeline step #{node.name}",
24
+ "message": "pipeline step SUCCESS\n\nScheduled start: #{node.@scheduledStartTime}\nActual start: #{node.@actualStartTime}\nActual end:\n#{node.@actualEndTime}"
25
+ },
26
+
27
+ {
28
+ "id": "FailureNotify",
29
+ "type": "SnsAlarm",
30
+ "topicArn": "arn:aws:sns:us-east-1:999999999999:datapipeline-name",
31
+ "subject": "FAILURE: pipeline step #{node.name}",
32
+ "message": "pipeline step FAILED #{node.name}\n\nScheduled start: #{node.@scheduledStartTime}\nError message:\n#{node.errorMessage}\nError stack trace:\n#{node.errorStackTrace}"
33
+ },
34
+
35
+ {
36
+ "id": "S3FirstInput",
37
+ "type": "S3DataNode",
38
+ "schedule": { "ref": "Nightly" },
39
+ "directoryPath": "s3://data-bucket/path/to/data/dir1/"
40
+ },
41
+
42
+ {
43
+ "id": "S3SecondInput",
44
+ "type": "S3DataNode",
45
+ "schedule": { "ref": "Nightly" },
46
+ "directoryPath": "s3://data-bucket/path/to/data/dir2/"
47
+ },
48
+
49
+ {
50
+ "id": "S3ThirdInput",
51
+ "type": "S3DataNode",
52
+ "schedule": { "ref": "Nightly" },
53
+ "filePath": "s3://datapipeline-assets/path/to/data/file.csv"
54
+ },
55
+
56
+ {
57
+ "id": "S3Output",
58
+ "type": "S3DataNode",
59
+ "schedule": { "ref": "Nightly" },
60
+ "directoryPath": "s3://data-bucket/path/to_data/results/"
61
+ },
62
+
63
+ {
64
+ "id": "TestPipelineEC2Resource",
65
+ "type": "Ec2Resource",
66
+ "instanceType": "m1.large",
67
+ "schedule": { "ref": "Nightly" },
68
+ "logUri": "s3://logs-bucket/path/to/test_pipeline/logs/TestPipelineEC2Resource",
69
+ "terminateAfter": "6 hours",
70
+ "keyPair": "pipeline-debug",
71
+ "securityGroups": ["s-pipe-appdb"]
72
+ },
73
+
74
+ {
75
+ "id": "TestPipelineEMRCluster",
76
+ "type": "EmrCluster",
77
+ "masterInstanceType": "m1.large",
78
+ "taskInstanceType": "m1.large",
79
+ "coreInstanceType": "m1.large",
80
+ "coreInstanceCount": "2",
81
+ "terminateAfter": "1 hour",
82
+ "schedule": { "ref": "Nightly" },
83
+ "enableDebugging": "true",
84
+ "bootstrapAction": "s3://steps-bucket/test_pipeline/steps/bootstrap_emr.sh",
85
+ "emrLogUri": "s3://logs-bucket/path/to/test_pipeline/logs/RecentSalesPipelineEMRLogs",
86
+ "logUri": "s3://logs-bucket/path/to/test_pipeline/logs/TestPipelineEMRCluster"
87
+ },
88
+
89
+ {
90
+ "id": "BootstrapEnvironment",
91
+ "type": "ShellCommandActivity",
92
+ "onFail": { "ref": "FailureNotify" },
93
+ "stdout": "s3://logs-bucket/path/to/test_pipeline/logs/BootstrapEnvironment/stdout",
94
+ "stderr": "s3://logs-bucket/path/to/test_pipeline/logs/BootstrapEnvironment/stderr",
95
+ "schedule": { "ref": "Nightly" },
96
+ "runsOn": { "ref": "TestPipelineEC2Resource" },
97
+ "scriptUri": "s3://steps-bucket/test_pipeline/steps/bootstrap_ec2.sh"
98
+ },
99
+
100
+ {
101
+ "id": "TouchEMRCluster",
102
+ "type": "ShellCommandActivity",
103
+ "onFail": { "ref": "FailureNotify" },
104
+ "schedule": { "ref": "Nightly" },
105
+ "runsOn": { "ref": "TestPipelineEMRCluster" },
106
+ "command": "true"
107
+ },
108
+
109
+ {
110
+ "id": "EMRJob1",
111
+ "type": "EmrActivity",
112
+ "onFail": { "ref": "FailureNotify" },
113
+ "schedule": { "ref": "Nightly" },
114
+ "runsOn": { "ref": "TestPipelineEMRCluster" },
115
+ "step": "/home/hadoop/contrib/streaming/hadoop-streaming.jar,-input,s3://data-bucket/path/to/data/dir1/,-input,s3://data-bucket/path/to/data/dir2/,-input,s3://data-bucket/path/to/data/file.csv,-output,s3://data-bucket/path/to_data/results/,-mapper,s3n://steps-bucket/test_pipeline/steps/mapper.rb,-reducer,s3n://steps-bucket/test_pipeline/steps/reducer.rb,-cacheFile,s3n://steps-bucket/test_pipeline/steps/types.rb#types.rb,-cacheFile,s3://data-bucket/path/to/data/notes.yml#notes.yml,-cmdenv,API_KEY=123456",
116
+ "input": [ { "ref": "S3FirstInput" }, { "ref": "S3SecondInput" }, { "ref": "S3ThirdInput" } ],
117
+ "output": { "ref": "S3Output" }
118
+ },
119
+
120
+ {
121
+ "id": "Complete",
122
+ "type": "ShellCommandActivity",
123
+ "onFail": { "ref": "FailureNotify" },
124
+ "schedule": { "ref": "Nightly" },
125
+ "onSuccess": { "ref": "SuccessNotify" },
126
+ "runsOn": { "ref": "TestPipelineEC2Resource" },
127
+ "dependsOn": [
128
+ { "ref": "TouchEMRCluster" },
129
+ { "ref": "EMRJob1" }
130
+ ],
131
+ "command": "true"
132
+ }
133
+
134
+ ]
135
+ }