hasta 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. checksums.yaml +7 -0
  2. data/.cane +1 -0
  3. data/.gitignore +3 -0
  4. data/.rspec +2 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +150 -0
  8. data/Rakefile +15 -0
  9. data/hasta.gemspec +29 -0
  10. data/lib/hasta.rb +46 -0
  11. data/lib/hasta/cached_s3_file.rb +21 -0
  12. data/lib/hasta/combined_data_source.rb +35 -0
  13. data/lib/hasta/combined_storage.rb +30 -0
  14. data/lib/hasta/configuration.rb +88 -0
  15. data/lib/hasta/emr_job_definition.rb +104 -0
  16. data/lib/hasta/emr_node.rb +103 -0
  17. data/lib/hasta/env.rb +35 -0
  18. data/lib/hasta/execution_context.rb +90 -0
  19. data/lib/hasta/filter.rb +40 -0
  20. data/lib/hasta/filtered_s3_file.rb +34 -0
  21. data/lib/hasta/identity_mapper.rb +17 -0
  22. data/lib/hasta/identity_reducer.rb +18 -0
  23. data/lib/hasta/in_memory_data_sink.rb +40 -0
  24. data/lib/hasta/in_memory_data_source.rb +35 -0
  25. data/lib/hasta/interpolate_string.rb +45 -0
  26. data/lib/hasta/local_file_path.rb +12 -0
  27. data/lib/hasta/local_storage.rb +41 -0
  28. data/lib/hasta/mapper.rb +23 -0
  29. data/lib/hasta/reducer.rb +29 -0
  30. data/lib/hasta/resolve_cached_s3_file.rb +29 -0
  31. data/lib/hasta/resolve_filtered_s3_file.rb +22 -0
  32. data/lib/hasta/runner.rb +32 -0
  33. data/lib/hasta/s3_data_sink.rb +48 -0
  34. data/lib/hasta/s3_data_source.rb +41 -0
  35. data/lib/hasta/s3_file.rb +56 -0
  36. data/lib/hasta/s3_file_cache.rb +23 -0
  37. data/lib/hasta/s3_storage.rb +21 -0
  38. data/lib/hasta/s3_uri.rb +60 -0
  39. data/lib/hasta/sorted_data_source.rb +36 -0
  40. data/lib/hasta/storage.rb +82 -0
  41. data/lib/hasta/tasks.rb +8 -0
  42. data/lib/hasta/tasks/runner.rb +84 -0
  43. data/lib/hasta/version.rb +3 -0
  44. data/spec/fixtures/hasta/filter_config.txt +1 -0
  45. data/spec/fixtures/hasta/json/emr_node.json +10 -0
  46. data/spec/fixtures/hasta/json/pipeline_definition.json +135 -0
  47. data/spec/fixtures/hasta/lib/failing_mapper.rb +19 -0
  48. data/spec/fixtures/hasta/lib/test_env_mapper.rb +20 -0
  49. data/spec/fixtures/hasta/lib/test_identity_mapper.rb +20 -0
  50. data/spec/fixtures/hasta/lib/test_types_mapper.rb +21 -0
  51. data/spec/fixtures/hasta/lib/types.rb +1 -0
  52. data/spec/fixtures/hasta/lib/unconventional_reducer.rb +17 -0
  53. data/spec/hasta/combined_data_source_spec.rb +25 -0
  54. data/spec/hasta/combined_storage_spec.rb +54 -0
  55. data/spec/hasta/configuration_spec.rb +49 -0
  56. data/spec/hasta/emr_job_definition_spec.rb +181 -0
  57. data/spec/hasta/emr_node_spec.rb +32 -0
  58. data/spec/hasta/env_spec.rb +30 -0
  59. data/spec/hasta/execution_context_spec.rb +67 -0
  60. data/spec/hasta/filter_spec.rb +66 -0
  61. data/spec/hasta/filtered_s3_file_spec.rb +45 -0
  62. data/spec/hasta/identity_mapper_spec.rb +22 -0
  63. data/spec/hasta/identity_reducer_spec.rb +20 -0
  64. data/spec/hasta/interpolate_string_spec.rb +44 -0
  65. data/spec/hasta/local_file_path_spec.rb +18 -0
  66. data/spec/hasta/local_storage_spec.rb +52 -0
  67. data/spec/hasta/mapper_spec.rb +26 -0
  68. data/spec/hasta/reducer_spec.rb +26 -0
  69. data/spec/hasta/resolved_cached_s3_file_spec.rb +68 -0
  70. data/spec/hasta/s3_data_source_spec.rb +39 -0
  71. data/spec/hasta/s3_file_cache_spec.rb +45 -0
  72. data/spec/hasta/s3_file_spec.rb +122 -0
  73. data/spec/hasta/s3_storage_spec.rb +24 -0
  74. data/spec/hasta/s3_uri_spec.rb +151 -0
  75. data/spec/hasta/sorted_data_source_spec.rb +22 -0
  76. data/spec/spec_helper.rb +24 -0
  77. data/spec/support/shared_contexts/hasta/local_fog_storage.rb +17 -0
  78. data/spec/support/shared_examples/hasta/storage_examples.rb +103 -0
  79. metadata +254 -0
@@ -0,0 +1,104 @@
# Copyright Swipely, Inc. All rights reserved.

require 'forwardable'
require 'json'

require 'hasta/emr_node'
require 'hasta/env'
require 'hasta/identity_mapper'
require 'hasta/mapper'
require 'hasta/reducer'
require 'hasta/identity_reducer'
require 'hasta/s3_data_source'
require 'hasta/s3_uri'
require 'hasta/s3_data_sink'

module Hasta
  # Defines the EMR job that is being tested
  class EmrJobDefinition
    extend Forwardable

    # Builds a job definition from the EmrActivity node with the given +id+
    # found in the pipeline-definition JSON file at +file_path+.
    # Raises ArgumentError when no matching EmrActivity exists.
    def self.load(file_path, id, scheduled_start_time = Time.now)
      objects = JSON.parse(File.read(file_path))['objects']
      emr_node = objects.find do |node|
        node['type'] == 'EmrActivity' && node['id'] == id
      end
      raise ArgumentError, "No EmrActivity for id: #{id} in file: #{file_path}" unless emr_node

      new(EmrNode.from_json(emr_node, scheduled_start_time))
    end

    def_delegators :emr_node, :id

    def initialize(emr_node)
      @emr_node = emr_node
    end

    # S3URIs for each of the job's input locations
    def input_paths
      @input_paths ||= emr_node.input_paths.map { |path| S3URI.parse(path) }
    end

    # S3URI for the job's output location
    def output_path
      @output_path ||= S3URI.parse(emr_node.output_path)
    end

    # Environment for the job: the node's cmdenv variables merged with one
    # <TAG>_FILE_PATH variable per non-Ruby cache file
    def env
      @env ||= begin
        file_vars = emr_node.cache_files.
          reject { |_, uri| uri.end_with?('.rb') }.
          map { |tag, uri| ["#{tag.split('.').first.upcase}_FILE_PATH", S3URI.parse(uri)] }

        Env.new(emr_node.env, Hash[file_vars])
      end
    end

    # Local paths for the Ruby cache files that must be on the load path
    def ruby_files
      @ruby_files ||= emr_node.cache_files.values.
        select { |uri| uri.end_with?('.rb') }.
        map { |uri| local_path_to_step_file(S3URI.parse(uri)) }
    end

    # The mapper implementation (IdentityMapper or a local-script Mapper)
    def mapper
      @mapper ||= parse_mapper(emr_node.mapper)
    end

    # The reducer implementation (IdentityReducer or a local-script Reducer)
    def reducer
      @reducer ||= parse_reducer(emr_node.reducer)
    end

    # One S3DataSource per input path
    def data_sources
      @data_sources ||= input_paths.map { |uri| S3DataSource.new(uri) }
    end

    # S3DataSink for the output path
    def data_sink
      @data_sink ||= S3DataSink.new(output_path)
    end

    private

    attr_reader :emr_node

    # Maps an S3 URI to the local project path of the corresponding step file
    def local_path_to_step_file(s3_uri)
      File.join(Hasta.project_root, Hasta.project_steps, s3_uri.basename)
    end

    # 'cat' and Hadoop's IdentityMapper mean "no-op"; anything else is a
    # script on S3 that has a local counterpart in the project steps dir
    def parse_mapper(mapper_command)
      if %w[cat org.apache.hadoop.mapred.lib.IdentityMapper].include?(mapper_command)
        IdentityMapper
      else
        Mapper.new(local_path_to_step_file(S3URI.parse(mapper_command)))
      end
    end

    # Same convention as parse_mapper, for the reduce side
    def parse_reducer(reducer_command)
      if %w[cat org.apache.hadoop.mapred.lib.IdentityReducer].include?(reducer_command)
        IdentityReducer
      else
        Reducer.new(local_path_to_step_file(S3URI.parse(reducer_command)))
      end
    end
  end
end
@@ -0,0 +1,103 @@
# Copyright Swipely, Inc. All rights reserved.

require 'hasta/interpolate_string'

module Hasta
  # Models the Amazon Data Pipeline configuration details for the EMR job that is being tested
  class EmrNode
    class << self
      # Builds an EmrNode from the JSON hash of an EmrActivity pipeline object.
      # scheduled_start_time is used to interpolate #{scheduledStartTime}
      # expressions in the input/output paths.
      def from_json(json, scheduled_start_time = Time.now)
        command_line = parse_step_line(json['step'])

        new(
          :id => json['id'],
          :input_paths => command_line['input'],
          :output_path => command_line['output'].first,
          :mapper => command_line['mapper'].first,
          :reducer => command_line['reducer'].first,
          :cache_files => command_line['cacheFile'],
          :env => command_line['cmdenv'],
          :scheduled_start_time => scheduled_start_time
        )
      end

      private

      # Parses the 'step' attribute of an EMR configuration into a Hash
      # Sample step line:
      #   "/home/hadoop/contrib/streaming/hadoop-streaming.jar,
      #   -input,s3n://data-bucket/input1/,
      #   -output,s3://data-bucket/output/,
      #   -mapper,cat,
      #   -reducer,s3n://steps-bucket/path/to/reducer.rb,
      #   -cacheFile,s3://data-bucket/path/to/mappings.yml#mappings.yml,
      #   -cacheFile,s3://data-bucket/path/to/ignored.yml#ignored.yml,
      #   -cmdenv,API_KEY=123456,
      #   -cmdenv,ENVIRONMENT_NAME=uat"
      #
      # Sample output:
      # {
      #   "input" => ["s3n://data-bucket/input1/"],
      #   "output" => ["s3://data-bucket/output/"],
      #   "mapper" => ["cat"],
      #   "reducer" => ["s3n://steps-bucket/path/to/reducer.rb"],
      #   "cacheFile" => ["s3://data-bucket/path/to/mappings.yml#mappings.yml",
      #                   "s3://data-bucket/path/to/ignored.yml#ignored.yml"],
      #   "cmdenv" => ["API_KEY=123456", "ENVIRONMENT_NAME=uat"]
      # }
      #
      def parse_step_line(step)
        # Default each missing key to a fresh Array so we can append blindly
        parsed = Hash.new { |h, k| h[k] = [] }
        step.
          split(',-').
          drop(1).
          map { |value| i = value.index(','); [value[0...i], value[i+1..-1]] }.
          each do |switch, arg|
            parsed[switch] << arg
          end

        parsed
      end
    end

    def initialize(attributes)
      @attributes = attributes
    end

    def id
      attributes[:id]
    end

    # Input S3 paths with pipeline expressions interpolated
    def input_paths
      # Fix: memoize under @input_paths (was @input_path), matching the
      # method name and the naming convention of the other memoized readers
      @input_paths ||= attributes[:input_paths].map { |path| interpolate(path) }
    end

    # Output S3 path with pipeline expressions interpolated
    def output_path
      @output_path ||= interpolate(attributes[:output_path])
    end

    def mapper
      attributes[:mapper]
    end

    def reducer
      attributes[:reducer]
    end

    # Hash of local name => S3 URI, parsed from "uri#name" cacheFile entries
    def cache_files
      @cache_files ||= Hash[attributes[:cache_files].map { |value| value.split('#').reverse }]
    end

    # Hash of variable name => value, parsed from "NAME=value" cmdenv entries
    def env
      @env ||= Hash[attributes[:env].map { |value| value.split('=') }]
    end

    private

    attr_reader :attributes

    # Evaluates #{scheduledStartTime} references in the given path
    def interpolate(path)
      InterpolateString.evaluate(path, 'scheduledStartTime' => attributes[:scheduled_start_time])
    end
  end
end
data/lib/hasta/env.rb ADDED
@@ -0,0 +1,35 @@
# Copyright Swipely, Inc. All rights reserved.

require 'hasta/local_file_path'
require 'hasta/s3_data_source'

module Hasta
  # Constructs the ENV variables required to run a local EMR job
  class Env
    attr_reader :variables, :files

    def initialize(
      variables = {},
      files = {},
      combined_storage = Hasta.combined_storage
    )
      @variables = variables
      @files = files
      @combined_storage = combined_storage
    end

    # Downloads each configured file through the combined storage and
    # returns the complete environment: the configured variables merged
    # with one local-file-path variable per downloaded file.
    def setup
      downloaded = files.each_with_object({}) do |(key, s3_uri), vars|
        source = S3DataSource.new(s3_uri, combined_storage)
        vars[key] = LocalFilePath.for(combined_storage.write(s3_uri, source))
      end

      variables.merge(downloaded)
    end

    private

    attr_reader :combined_storage
  end
end
@@ -0,0 +1,90 @@
# Copyright Swipely, Inc. All rights reserved.

require 'open3'
require 'stringio' # Fix: StringIO is used in stream_output but was never required

require 'hasta/s3_data_sink'

module Hasta
  # Executes each local EMR job in isolation
  class ExecutionContext
    # Runs a ruby script in a child process with a custom environment
    # and load path
    class Subprocess
      attr_reader :stdin, :stdout, :stderr

      def initialize(ruby_files, env)
        @ruby_files = ruby_files
        @env = env
      end

      # Spawns `ruby source_file`, exposes the child's stdin/stdout/stderr,
      # yields self so the caller can stream data, then raises
      # ExecutionError if the child exits with a non-zero status.
      def start(source_file)
        Open3.popen3(*cmd_line(source_file)) do |stdin, stdout, stderr, wait_thr|
          @stdin, @stdout, @stderr, @wait_thr = stdin, stdout, stderr, wait_thr

          yield self

          if (exit_code = wait_thr.value.exitstatus) != 0
            raise ExecutionError, "#{source_file} exited with non-zero status: #{exit_code}"
          end
        end
      end

      private

      # Fix: removed dead `:source_file` from this attr_reader list —
      # no @source_file ivar is ever assigned; the path is always passed
      # as a parameter to #start and #cmd_line.
      attr_reader :env, :ruby_files

      # [env hash, ruby path, -I dirs..., script] for Open3.popen3
      def cmd_line(source_file)
        [env, ruby_exe_path] + load_path + [source_file]
      end

      # Absolute path of the currently running ruby interpreter
      def ruby_exe_path
        File.join(RbConfig::CONFIG['bindir'], RbConfig::CONFIG['ruby_install_name'])
      end

      # -I flags covering the directory of every required ruby file
      def load_path
        ruby_files.
          map { |file| File.expand_path(File.dirname(file)) }.
          uniq.
          map { |path| ['-I', path] }.
          flatten
      end
    end

    def initialize(ruby_files = [], env = {})
      @sub_process = Subprocess.new(ruby_files, env)
    end

    # Runs source_file in a subprocess, feeding data_source to its stdin,
    # appending its stdout lines to data_sink and logging its stderr.
    # Closes the sink when the subprocess finishes.
    def execute(source_file, data_source, data_sink)
      sub_process.start(source_file) do |sub_process|
        [
          stream_input(data_source, sub_process.stdin),
          stream_output(sub_process.stdout) { |line| data_sink << line },
          stream_output(sub_process.stderr) { |line| Hasta.logger.error line },
        ].each(&:join)
      end

      data_sink.close
    end

    private

    attr_reader :sub_process

    # Pumps the data source into the child's stdin on a background thread,
    # closing the write end so the child sees EOF
    def stream_input(data_source, io)
      Thread.new do
        data_source.each_line do |line|
          io.puts line
        end

        io.close_write
      end
    end

    # Drains an output pipe on a background thread, yielding each line
    # with trailing whitespace stripped
    def stream_output(io)
      Thread.new do
        StringIO.new(io.read).each_line do |line|
          yield line.rstrip
        end
      end
    end
  end
end
@@ -0,0 +1,40 @@
# Copyright Swipely, Inc. All rights reserved.

require 'set'

module Hasta
  # The filter that is used to drop unwanted lines from input files
  class Filter
    # Builds a Filter from a file containing one accept-regex per line.
    # Raises ConfigurationError when the file cannot be read or a line is
    # not a valid regex.
    #
    # Fix: dropped the `if lines = File.read(file)` guard — File.read never
    # returns nil (it raises on failure, which the rescue below handles),
    # so the conditional was always truthy and only obscured the flow.
    def self.from_file(file)
      lines = File.read(file)
      Hasta.logger.debug "Loading data filter file: #{File.expand_path(file)}"
      new(*lines.split("\n").map { |line| Regexp.new(line) })
    rescue => ex
      # Idiom fix: `raise Class, message` instead of `raise Class.new, message`
      raise ConfigurationError,
        "Failed to load filter configuration file: #{file} - #{ex.message}"
    end

    def initialize(*accept_regexes)
      @accept_regexes = Set.new(accept_regexes)
    end

    # True when the line matches at least one accept regex
    def include?(line)
      to_proc.call(line)
    end

    # Memoized predicate usable with Enumerable methods (&filter)
    def to_proc
      @proc ||= Proc.new { |line| !!(accept_regexes.find { |regex| line =~ regex }) }
    end

    def to_s
      "#<#{self.class.name}:#{accept_regexes.to_a.inspect}>"
    end

    private

    # Regexes in a deterministic order so #to_s (and anything fingerprinted
    # from it) is stable regardless of construction order
    def accept_regexes
      @accept_regexes.to_a.sort_by(&:inspect)
    end
  end
end
@@ -0,0 +1,34 @@
# Copyright Swipely, Inc. All rights reserved.

require 'delegate'
require 'digest/md5'

module Hasta
  # An S3File delegate that drops filtered lines
  class FilteredS3File < SimpleDelegator
    def initialize(s3_file, filter)
      super(s3_file)
      @filter = filter
    end

    # The file contents with all non-matching lines removed
    def body
      each_line.to_a.join
    end

    # MD5 digest combining the wrapped file's fingerprint with the filter's
    # description, so the same file under different filters caches separately
    def fingerprint
      @fingerprint ||= begin
        digest_input = [__getobj__.fingerprint, filter.to_s].join('_')
        Digest::MD5.hexdigest(digest_input)
      end
    end

    # Yields each line the filter accepts; returns an Enumerator when no
    # block is given
    def each_line
      return enum_for(:each_line) unless block_given?

      __getobj__.each_line do |line|
        filter.include?(line) && yield(line)
      end
    end

    private

    attr_reader :filter
  end
end
@@ -0,0 +1,17 @@
# Copyright Swipely, Inc. All rights reserved.

require 'hasta/combined_data_source'

module Hasta
  # Used by any EMR job that requires an identity mapper
  module IdentityMapper
    # Streams every line of the combined data sources into data_sink with
    # trailing whitespace stripped, then returns the closed sink.
    # The first argument (the execution context) is ignored.
    def self.map(_, data_sources, data_sink = InMemoryDataSink.new)
      Hasta.logger.debug "Starting Identity Mapper"

      combined = CombinedDataSource.new(data_sources)
      combined.each_line { |line| data_sink << line.rstrip }

      data_sink.close.tap { Hasta.logger.debug "Finished Identity Mapper" }
    end
  end
end
@@ -0,0 +1,18 @@
# Copyright Swipely, Inc. All rights reserved.

require 'hasta/in_memory_data_sink'
require 'hasta/sorted_data_source'

module Hasta
  # Used by any EMR job that requires an identity reducer
  module IdentityReducer
    # Streams every line of the sorted data source into data_sink with
    # trailing whitespace stripped, then returns the closed sink.
    # The first argument (the execution context) is ignored.
    def self.reduce(_, data_source, data_sink = InMemoryDataSink.new)
      Hasta.logger.debug "Starting Identity Reducer"

      sorted = SortedDataSource.new(data_source)
      sorted.each_line { |line| data_sink << line.rstrip }

      data_sink.close.tap { Hasta.logger.debug "Finished Identity Reducer" }
    end
  end
end