hasta 0.1.0

Files changed (79)
  1. checksums.yaml +7 -0
  2. data/.cane +1 -0
  3. data/.gitignore +3 -0
  4. data/.rspec +2 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +150 -0
  8. data/Rakefile +15 -0
  9. data/hasta.gemspec +29 -0
  10. data/lib/hasta.rb +46 -0
  11. data/lib/hasta/cached_s3_file.rb +21 -0
  12. data/lib/hasta/combined_data_source.rb +35 -0
  13. data/lib/hasta/combined_storage.rb +30 -0
  14. data/lib/hasta/configuration.rb +88 -0
  15. data/lib/hasta/emr_job_definition.rb +104 -0
  16. data/lib/hasta/emr_node.rb +103 -0
  17. data/lib/hasta/env.rb +35 -0
  18. data/lib/hasta/execution_context.rb +90 -0
  19. data/lib/hasta/filter.rb +40 -0
  20. data/lib/hasta/filtered_s3_file.rb +34 -0
  21. data/lib/hasta/identity_mapper.rb +17 -0
  22. data/lib/hasta/identity_reducer.rb +18 -0
  23. data/lib/hasta/in_memory_data_sink.rb +40 -0
  24. data/lib/hasta/in_memory_data_source.rb +35 -0
  25. data/lib/hasta/interpolate_string.rb +45 -0
  26. data/lib/hasta/local_file_path.rb +12 -0
  27. data/lib/hasta/local_storage.rb +41 -0
  28. data/lib/hasta/mapper.rb +23 -0
  29. data/lib/hasta/reducer.rb +29 -0
  30. data/lib/hasta/resolve_cached_s3_file.rb +29 -0
  31. data/lib/hasta/resolve_filtered_s3_file.rb +22 -0
  32. data/lib/hasta/runner.rb +32 -0
  33. data/lib/hasta/s3_data_sink.rb +48 -0
  34. data/lib/hasta/s3_data_source.rb +41 -0
  35. data/lib/hasta/s3_file.rb +56 -0
  36. data/lib/hasta/s3_file_cache.rb +23 -0
  37. data/lib/hasta/s3_storage.rb +21 -0
  38. data/lib/hasta/s3_uri.rb +60 -0
  39. data/lib/hasta/sorted_data_source.rb +36 -0
  40. data/lib/hasta/storage.rb +82 -0
  41. data/lib/hasta/tasks.rb +8 -0
  42. data/lib/hasta/tasks/runner.rb +84 -0
  43. data/lib/hasta/version.rb +3 -0
  44. data/spec/fixtures/hasta/filter_config.txt +1 -0
  45. data/spec/fixtures/hasta/json/emr_node.json +10 -0
  46. data/spec/fixtures/hasta/json/pipeline_definition.json +135 -0
  47. data/spec/fixtures/hasta/lib/failing_mapper.rb +19 -0
  48. data/spec/fixtures/hasta/lib/test_env_mapper.rb +20 -0
  49. data/spec/fixtures/hasta/lib/test_identity_mapper.rb +20 -0
  50. data/spec/fixtures/hasta/lib/test_types_mapper.rb +21 -0
  51. data/spec/fixtures/hasta/lib/types.rb +1 -0
  52. data/spec/fixtures/hasta/lib/unconventional_reducer.rb +17 -0
  53. data/spec/hasta/combined_data_source_spec.rb +25 -0
  54. data/spec/hasta/combined_storage_spec.rb +54 -0
  55. data/spec/hasta/configuration_spec.rb +49 -0
  56. data/spec/hasta/emr_job_definition_spec.rb +181 -0
  57. data/spec/hasta/emr_node_spec.rb +32 -0
  58. data/spec/hasta/env_spec.rb +30 -0
  59. data/spec/hasta/execution_context_spec.rb +67 -0
  60. data/spec/hasta/filter_spec.rb +66 -0
  61. data/spec/hasta/filtered_s3_file_spec.rb +45 -0
  62. data/spec/hasta/identity_mapper_spec.rb +22 -0
  63. data/spec/hasta/identity_reducer_spec.rb +20 -0
  64. data/spec/hasta/interpolate_string_spec.rb +44 -0
  65. data/spec/hasta/local_file_path_spec.rb +18 -0
  66. data/spec/hasta/local_storage_spec.rb +52 -0
  67. data/spec/hasta/mapper_spec.rb +26 -0
  68. data/spec/hasta/reducer_spec.rb +26 -0
  69. data/spec/hasta/resolved_cached_s3_file_spec.rb +68 -0
  70. data/spec/hasta/s3_data_source_spec.rb +39 -0
  71. data/spec/hasta/s3_file_cache_spec.rb +45 -0
  72. data/spec/hasta/s3_file_spec.rb +122 -0
  73. data/spec/hasta/s3_storage_spec.rb +24 -0
  74. data/spec/hasta/s3_uri_spec.rb +151 -0
  75. data/spec/hasta/sorted_data_source_spec.rb +22 -0
  76. data/spec/spec_helper.rb +24 -0
  77. data/spec/support/shared_contexts/hasta/local_fog_storage.rb +17 -0
  78. data/spec/support/shared_examples/hasta/storage_examples.rb +103 -0
  79. metadata +254 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: adeafa9f4dae6acb9dd59dc39432d4e4b3d86b79
+   data.tar.gz: 3f15a082f3c72d9a75df32e728cff7cc0ff88787
+ SHA512:
+   metadata.gz: 0e0dc797ebb1acab0781324a47ed6184f51797a1b3893da9ac069024dc9cfadd56f6dcb5c1338855e59c1c0a82745c601102fd62e4de58a198ed278332ac5f24
+   data.tar.gz: 84a5056c2c069512a99eb8b4d9b58c9dcb0827732ffe555f7708705da0579706be7659abbfc6f4b97e4e458b02e01d2a25c38489c240e0a166d1e2b316d87132
data/.cane ADDED
@@ -0,0 +1 @@
+ --style-measure 100
data/.gitignore ADDED
@@ -0,0 +1,3 @@
+ .bundle
+ Gemfile.lock
+ vendor/bundle
data/.rspec ADDED
@@ -0,0 +1,2 @@
+ --color
+ --format progress
data/Gemfile ADDED
@@ -0,0 +1,4 @@
+ source 'https://rubygems.org'
+
+ # Specify your gem's dependencies in hasta.gemspec
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,20 @@
+ Copyright (c) 2014 Swipely, Inc.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,150 @@
+ # Hasta
+
+ <b>HA</b>doop <b>S</b>treaming <b>T</b>est h<b>A</b>rness for Amazon EMR
+
+ A test harness for running [Hadoop Streaming](http://hadoop.apache.org/docs/r1.2.1/streaming.html) jobs written in Ruby without running Hadoop.
+ The test harness understands the [Amazon Data Pipeline](http://aws.amazon.com/datapipeline/) and [Elastic Map Reduce](http://aws.amazon.com/elasticmapreduce/) (EMR) JSON definition format and can automatically parse the details of a job out of a Data Pipeline configuration file.
+
+ ## Installation
+
+ Add this line to your application's Gemfile:
+
+ gem 'hasta'
+
+ And then execute:
+
+ $ bundle
+
+ Or install it yourself as:
+
+ $ gem install hasta
+
+ ## Usage
+
+ 1. Require the Hasta Rake tasks in your project's Rakefile
+
+ ```ruby
+ require 'hasta/tasks'
+ ```
+
+ 2. Add the `Hasta::Tasks::Runner` task to your project's Rakefile
+
+ ```ruby
+ Hasta::Tasks::Runner.new do |task, opts|
+   task.definition_file = <path-to-AWS-datapipeline-definition-json-file>
+   task.job_id = opts[:job_id]
+   task.scheduled_start_time = Time.parse(opts[:scheduled_start_time])
+   task.project_root = File.dirname(__FILE__)
+ end
+ ```
+ 3. Run the test from the command line by calling:
+
+ ```
+ % rake runner[<job-id>]
+ ```
+ or
+ ```
+ % rake runner[<job-id>,<scheduled-start-time>]
+ ```
+ Where `job-id` is the id of the EMR job you are testing and `scheduled-start-time` is an [ISO 8601](http://en.wikipedia.org/wiki/ISO_8601)-formatted time specifying the time to use when interpolating any `@scheduledStartTime` variable references in your pipeline definition file.
+ If your pipeline definition file has no `@scheduledStartTime` variable references, there is no need to include a `scheduled-start-time` argument.
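+
+ For example, with a hypothetical EMR activity id of `MyEmrActivity` and a scheduled start time of midnight UTC on March 1, 2014, the invocation would look like:
+
+ ```
+ % rake runner[MyEmrActivity,2014-03-01T00:00:00Z]
+ ```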
+
+ ## Configuration
+
+ The following code snippet illustrates how to update the global Hasta configuration, which values are mandatory, and which values have defaults.
+
+ ```ruby
+ Hasta.configure do |config|
+   # mandatory
+   config.project_root = nil
+
+   # optional
+   config.local_storage_root = '~/fog'
+   config.cache_storage_root = '~/.hasta'
+   config.project_steps = 'steps'
+   config.logger = Logger.new(STDOUT)
+ end
+ ```
+
+ ## Data
+
+ All of the data read and written by EMR jobs is stored in [S3](http://aws.amazon.com/s3/).
+ Hasta uses the S3 URIs contained in the Data Pipeline definition file for all of its reads and writes.
+ When reading data, it will first look on the local filesystem.
+ If the requested S3 URI is not found locally, it will look for it on S3.
+ Hasta never writes data to S3, only to the local filesystem.
+
+ Hasta uses [Fog](http://fog.io/)'s [local storage provider](https://github.com/fog/fog/blob/master/lib/fog/local/storage.rb) to read and write data to the local filesystem using S3 URIs.
+ The root directory for the local storage is controlled by the `Hasta.local_storage_root` configuration property, which is set to `~/fog` by default.
+
+ Hasta reads all of its input data from the S3 paths specified in the AWS Data Pipeline definition file.
+ If you wish to use different data for an input, you need to put that data into the local directory that corresponds to the S3 path in the definition file.
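+
+ For example, with the default `local_storage_root` of `~/fog`, input that the definition file reads from a (hypothetical) URI such as `s3://my-bucket/input/part-00000` would be looked up at:
+
+ ```
+ ~/fog/my-bucket/input/part-00000
+ ```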
+
+ To control which credentials Hasta is using when communicating with S3, update your `~/.fog` file or set the `FOG_CREDENTIAL` environment variable to the appropriate credential.
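+
+ A minimal `~/.fog` file might look like the following sketch (the credential name and key values are placeholders):
+
+ ```yaml
+ test_credentials:
+   aws_access_key_id: YOUR_ACCESS_KEY_ID
+   aws_secret_access_key: YOUR_SECRET_ACCESS_KEY
+ ```
+
+ Setting `FOG_CREDENTIAL=test_credentials` tells Fog, and therefore Hasta, to use that credential set.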
+
+ ## Data Filtering
+
+ Hasta automatically filters input data to minimize execution time.
+ By default, Hasta looks for a file in the current directory named `filter_config.txt` for the filtering configuration.
+ You can change the filter configuration file by setting the `HASTA_DATA_FILTER_FILE` environment variable.
+ If you want to disable data filtering, you can set the `HASTA_DATA_FILTERING` environment variable to `OFF`.
+
+ ### Filter Configuration
+
+ The filter configuration file contains regular expressions, one per line.
+ Any line of input data that matches at least one of the regular expressions is included in the test input.
+ Lines that do not match any of the regular expressions are excluded from the test input.
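+
+ As an illustration, a `filter_config.txt` containing the two (made-up) patterns below would keep only input lines that contain `2014-03` or begin with `ERROR`:
+
+ ```
+ 2014-03
+ ^ERROR
+ ```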
+
+ ### Caching
+
+ Hasta caches the filtered input data locally to improve performance.
+ The first time a data source is referenced in a test, the filtered results are written locally.
+ Subsequent tests that access the same data source with the same filter are read from the local cache.
+ This results in a significant speedup on subsequent runs when dealing with aggressively filtered large data sets.
+ By default, the files are written to the `~/.hasta/cache` directory, but this can be controlled using the `cache_storage_root` configuration setting.
+
+ ## Execution
+
+ Hasta sets up the environment variables specified by the `-cmdenv` switch in the job definition.
+ It also pulls down all cache files from S3 and stores them in local S3 storage.
+ For all cache files that do not have a `.rb` file extension, an environment variable is added to the `ENV` that points to the absolute path of the local file.
+
+ ### Example
+ Given the following `cacheFile` parameter:
+ ```
+ -cacheFile s3://my-bucket/path/to/abbreviations.json#abbreviations.json
+ ```
+
+ The following environment variable will be added to the `ENV`:
+ ```
+ ENV['ABBREVIATIONS_FILE_PATH'] #=> "#{Hasta.local_storage_root}/my-bucket/path/to/abbreviations.json"
+ ```
+
+ The parent directory of each cache file that has a `.rb` file extension is added to the `$LOAD_PATH`, so the mapper and reducer can use `require` statements to load the code in these files.
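+
+ For example, given a hypothetical cache file parameter of `-cacheFile s3://my-bucket/lib/string_utils.rb#string_utils.rb`, the parent directory of the local copy ends up on the `$LOAD_PATH`, so a mapper or reducer can simply write:
+
+ ```ruby
+ require 'string_utils'
+ ```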
+
+ Hasta executes mappers and reducers in subprocesses.
+ This isolates each job and prevents the modifications to `ENV` and `$LOAD_PATH` described above from leaking into the parent process.
+
+ The output of the mapper is sorted before it is processed by the reducer.
+ The mapper output is sorted in ascending order, according to the natural sort order for Ruby Strings.
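+
+ Note that this is a plain lexicographic sort, so numeric keys are ordered as strings; a rough illustration:
+
+ ```ruby
+ ["10\tb", "2\ta", "apple\tc"].sort #=> ["10\tb", "2\ta", "apple\tc"]
+ ```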
+
+ ## Requirements
+
+ ### Mappers and Reducers
+ 1. must be stand-alone Ruby scripts (a minimal sketch follows this list)
+ 2. must be defined in the `Hasta.project_root`/`Hasta.project_steps` directory
+ 3. must read their input lines from stdin and write their output lines to stdout
+ 4. any data written to stderr will be logged at the error level using `Hasta.logger`
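+
+ As a minimal sketch, a conforming mapper might look like this (the filename and word-count logic are hypothetical, not part of this gem):
+
+ ```ruby
+ #!/usr/bin/env ruby
+ # steps/word_count_mapper.rb -- hypothetical example step
+
+ # Read input lines from stdin and emit tab-separated key/value pairs to stdout.
+ STDIN.each_line do |line|
+   line.chomp.split.each do |word|
+     STDOUT.puts "#{word}\t1"
+   end
+ end
+ ```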
+
+
+ ## Contributing
+
+ 1. Fork it
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
+ 4. Push to the branch (`git push origin my-new-feature`)
+ 5. Create new Pull Request
+
+ ## License
+
+ Copyright (c) 2014 Swipely, Inc. See [LICENSE.txt](https://github.com/swipely/hasta/blob/master/LICENSE.txt) for further details.
data/Rakefile ADDED
@@ -0,0 +1,15 @@
+ require 'rake'
+ require 'cane/rake_task'
+ require 'rspec/core/rake_task'
+
+ RSpec::Core::RakeTask.new do |t|
+   t.pattern = File.join(File.dirname(__FILE__), 'spec', '**', '*_spec.rb')
+ end
+
+ Cane::RakeTask.new('quality') do |cane|
+   cane.canefile = '.cane'
+ end
+
+ task :all => [:quality, :spec]
+
+ task :default => :all
data/hasta.gemspec ADDED
@@ -0,0 +1,29 @@
+ # coding: utf-8
+ lib = File.expand_path('../lib', __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require 'hasta/version'
+
+ Gem::Specification.new do |spec|
+   spec.name = "hasta"
+   spec.version = Hasta::VERSION
+   spec.authors = ["danhodge"]
+   spec.email = ["dan@swipely.com"]
+   spec.summary = %q{HAdoop Streaming Test hArness}
+   spec.description = %q{Harness for locally testing streaming Hadoop jobs written in Ruby}
+   spec.homepage = ""
+   spec.license = "MIT"
+
+   spec.files = `git ls-files -z`.split("\x0")
+   spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
+   spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
+   spec.require_paths = ["lib"]
+
+   spec.add_dependency "fog"
+   spec.add_dependency "json"
+
+   spec.add_development_dependency "bundler", "~> 1.5"
+   spec.add_development_dependency "cane"
+   spec.add_development_dependency "pry"
+   spec.add_development_dependency "rake"
+   spec.add_development_dependency "rspec"
+ end
data/lib/hasta.rb ADDED
@@ -0,0 +1,46 @@
+ # Copyright Swipely, Inc. All rights reserved.
+
+ require "hasta/version"
+ require "hasta/configuration"
+
+ require "forwardable"
+
+ # The HAdoop Streaming Test hArness
+ module Hasta
+   extend self
+   extend Forwardable
+
+   Error = Class.new(StandardError)
+   NonExistentPath = Class.new(Error)
+   ClassLoadError = Class.new(Error)
+   ExecutionError = Class.new(Error)
+   ConfigurationError = Class.new(Error)
+
+   DELEGATED_ATTRS = [
+     :combined_storage,
+     :local_storage_root,
+     :logger,
+     :project_root,
+     :project_steps,
+   ]
+
+   def_delegators :config, *DELEGATED_ATTRS
+
+   def configure
+     yield config
+   end
+
+   def tab_separated_line(line)
+     if line.include?("\t")
+       line
+     else
+       "#{line}\t"
+     end
+   end
+
+   private
+
+   def config
+     @config ||= Configuration.new
+   end
+ end
data/lib/hasta/cached_s3_file.rb ADDED
@@ -0,0 +1,21 @@
+ # Copyright Swipely, Inc. All rights reserved.
+
+ require 'delegate'
+
+ module Hasta
+   # Implements Hasta's S3File interface for files retrieved from the cache
+   class CachedS3File < SimpleDelegator
+     def initialize(cached_file, s3_uri)
+       super(S3File.wrap(cached_file))
+       @s3_uri = s3_uri
+     end
+
+     def key
+       @s3_uri.path
+     end
+
+     def s3_uri
+       @s3_uri
+     end
+   end
+ end
data/lib/hasta/combined_data_source.rb ADDED
@@ -0,0 +1,35 @@
+ # Copyright Swipely, Inc. All rights reserved.
+
+ module Hasta
+   # Combines multiple data sources so they can be iterated over continuously
+   class CombinedDataSource
+     attr_reader :name
+
+     def initialize(sources, name = nil)
+       @sources = sources
+       @name = name || sources.map(&:name).compact.join(', ')
+     end
+
+     def each_line
+       return enum_for(:each_line) unless block_given?
+
+       sources.each do |source|
+         source.each_line do |line|
+           yield line
+         end
+       end
+     end
+
+     def to_a
+       each_line.to_a
+     end
+
+     def to_s
+       "#<#{self.class.name}:#{name}>"
+     end
+
+     private
+
+     attr_reader :sources
+   end
+ end
data/lib/hasta/combined_storage.rb ADDED
@@ -0,0 +1,30 @@
+ # Copyright Swipely, Inc. All rights reserved.
+
+ require 'hasta/s3_uri'
+ require 'hasta/storage'
+
+ module Hasta
+   # The file storage interface used by the local map/reduce jobs
+   class CombinedStorage
+     def initialize(s3_storage, local_storage)
+       @s3_storage = s3_storage
+       @local_storage = local_storage
+     end
+
+     def files_for(s3_uri)
+       if local_storage.exists?(s3_uri)
+         local_storage.files_for(s3_uri)
+       else
+         s3_storage.files_for(s3_uri)
+       end
+     end
+
+     def write(s3_uri, data_source)
+       local_storage.write(s3_uri, data_source)
+     end
+
+     private
+
+     attr_reader :s3_storage, :local_storage
+   end
+ end
data/lib/hasta/configuration.rb ADDED
@@ -0,0 +1,88 @@
+ # Copyright Swipely, Inc. All rights reserved.
+
+ require 'fog'
+ require 'logger'
+
+ require 'hasta/local_storage'
+ require 'hasta/s3_storage'
+ require 'hasta/combined_storage'
+ require 'hasta/filter'
+ require 'hasta/resolve_cached_s3_file'
+ require 'hasta/resolve_filtered_s3_file'
+
+ module Hasta
+   # Global configuration settings
+   class Configuration
+     attr_accessor :project_root
+     attr_writer :local_storage_root, :cache_storage_root, :project_steps, :logger, :filter
+
+     def local_storage_root
+       @local_storage_root ||= '~/fog'
+     end
+
+     def cache_storage_root
+       @cache_storage_root ||= '~/.hasta'
+     end
+
+     def project_steps
+       @project_steps ||= 'steps'
+     end
+
+     def logger
+       @logger ||= Logger.new(STDOUT)
+     end
+
+     def project_steps_dir
+       File.join(project_root, project_steps)
+     end
+
+     def combined_storage
+       @combined_storage ||= CombinedStorage.new(
+         S3Storage.new(fog_s3_storage, resolver),
+         LocalStorage.new(fog_local_storage, resolver)
+       )
+     end
+
+     def filter
+       unless @filter || ENV['HASTA_DATA_FILTERING'] == 'OFF'
+         filter_file = ENV['HASTA_DATA_FILTER_FILE'] || 'filter_config.txt'
+         @filter ||= Filter.from_file(filter_file)
+       end
+
+       @filter
+     end
+
+     private
+
+     def fog_s3_storage
+       # Use FOG_CREDENTIAL env variable to control AWS credentials
+       @fog_s3_storage ||= Fog::Storage::AWS.new
+     end
+
+     def fog_local_storage
+       @fog_local_storage ||= local_fog(local_storage_root)
+     end
+
+     def fog_cache_storage
+       @fog_cache_storage ||= local_fog(cache_storage_root)
+     end
+
+     def local_fog(local_root)
+       Fog::Storage.new(
+         :provider => 'Local',
+         :local_root => local_root,
+         :endpoint => 'http://example.com'
+       )
+     end
+
+     def resolver
+       if filter
+         ResolveCachedS3File.new(
+           S3FileCache.new(fog_cache_storage), ResolveFilteredS3File.new(filter)
+         )
+       else
+         Hasta::Storage::ResolveS3File
+       end
+     end
+   end
+ end