hasta 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. checksums.yaml +7 -0
  2. data/.cane +1 -0
  3. data/.gitignore +3 -0
  4. data/.rspec +2 -0
  5. data/Gemfile +4 -0
  6. data/LICENSE.txt +20 -0
  7. data/README.md +150 -0
  8. data/Rakefile +15 -0
  9. data/hasta.gemspec +29 -0
  10. data/lib/hasta.rb +46 -0
  11. data/lib/hasta/cached_s3_file.rb +21 -0
  12. data/lib/hasta/combined_data_source.rb +35 -0
  13. data/lib/hasta/combined_storage.rb +30 -0
  14. data/lib/hasta/configuration.rb +88 -0
  15. data/lib/hasta/emr_job_definition.rb +104 -0
  16. data/lib/hasta/emr_node.rb +103 -0
  17. data/lib/hasta/env.rb +35 -0
  18. data/lib/hasta/execution_context.rb +90 -0
  19. data/lib/hasta/filter.rb +40 -0
  20. data/lib/hasta/filtered_s3_file.rb +34 -0
  21. data/lib/hasta/identity_mapper.rb +17 -0
  22. data/lib/hasta/identity_reducer.rb +18 -0
  23. data/lib/hasta/in_memory_data_sink.rb +40 -0
  24. data/lib/hasta/in_memory_data_source.rb +35 -0
  25. data/lib/hasta/interpolate_string.rb +45 -0
  26. data/lib/hasta/local_file_path.rb +12 -0
  27. data/lib/hasta/local_storage.rb +41 -0
  28. data/lib/hasta/mapper.rb +23 -0
  29. data/lib/hasta/reducer.rb +29 -0
  30. data/lib/hasta/resolve_cached_s3_file.rb +29 -0
  31. data/lib/hasta/resolve_filtered_s3_file.rb +22 -0
  32. data/lib/hasta/runner.rb +32 -0
  33. data/lib/hasta/s3_data_sink.rb +48 -0
  34. data/lib/hasta/s3_data_source.rb +41 -0
  35. data/lib/hasta/s3_file.rb +56 -0
  36. data/lib/hasta/s3_file_cache.rb +23 -0
  37. data/lib/hasta/s3_storage.rb +21 -0
  38. data/lib/hasta/s3_uri.rb +60 -0
  39. data/lib/hasta/sorted_data_source.rb +36 -0
  40. data/lib/hasta/storage.rb +82 -0
  41. data/lib/hasta/tasks.rb +8 -0
  42. data/lib/hasta/tasks/runner.rb +84 -0
  43. data/lib/hasta/version.rb +3 -0
  44. data/spec/fixtures/hasta/filter_config.txt +1 -0
  45. data/spec/fixtures/hasta/json/emr_node.json +10 -0
  46. data/spec/fixtures/hasta/json/pipeline_definition.json +135 -0
  47. data/spec/fixtures/hasta/lib/failing_mapper.rb +19 -0
  48. data/spec/fixtures/hasta/lib/test_env_mapper.rb +20 -0
  49. data/spec/fixtures/hasta/lib/test_identity_mapper.rb +20 -0
  50. data/spec/fixtures/hasta/lib/test_types_mapper.rb +21 -0
  51. data/spec/fixtures/hasta/lib/types.rb +1 -0
  52. data/spec/fixtures/hasta/lib/unconventional_reducer.rb +17 -0
  53. data/spec/hasta/combined_data_source_spec.rb +25 -0
  54. data/spec/hasta/combined_storage_spec.rb +54 -0
  55. data/spec/hasta/configuration_spec.rb +49 -0
  56. data/spec/hasta/emr_job_definition_spec.rb +181 -0
  57. data/spec/hasta/emr_node_spec.rb +32 -0
  58. data/spec/hasta/env_spec.rb +30 -0
  59. data/spec/hasta/execution_context_spec.rb +67 -0
  60. data/spec/hasta/filter_spec.rb +66 -0
  61. data/spec/hasta/filtered_s3_file_spec.rb +45 -0
  62. data/spec/hasta/identity_mapper_spec.rb +22 -0
  63. data/spec/hasta/identity_reducer_spec.rb +20 -0
  64. data/spec/hasta/interpolate_string_spec.rb +44 -0
  65. data/spec/hasta/local_file_path_spec.rb +18 -0
  66. data/spec/hasta/local_storage_spec.rb +52 -0
  67. data/spec/hasta/mapper_spec.rb +26 -0
  68. data/spec/hasta/reducer_spec.rb +26 -0
  69. data/spec/hasta/resolved_cached_s3_file_spec.rb +68 -0
  70. data/spec/hasta/s3_data_source_spec.rb +39 -0
  71. data/spec/hasta/s3_file_cache_spec.rb +45 -0
  72. data/spec/hasta/s3_file_spec.rb +122 -0
  73. data/spec/hasta/s3_storage_spec.rb +24 -0
  74. data/spec/hasta/s3_uri_spec.rb +151 -0
  75. data/spec/hasta/sorted_data_source_spec.rb +22 -0
  76. data/spec/spec_helper.rb +24 -0
  77. data/spec/support/shared_contexts/hasta/local_fog_storage.rb +17 -0
  78. data/spec/support/shared_examples/hasta/storage_examples.rb +103 -0
  79. metadata +254 -0
@@ -0,0 +1,40 @@
# Copyright Swipely, Inc. All rights reserved.

require 'hasta/in_memory_data_source'

module Hasta
  # Data sink for writing data in-memory
  class InMemoryDataSink
    attr_reader :name

    def initialize(name=nil)
      @name = name
    end

    # Buffers a line, normalized to tab-separated form.
    def <<(line)
      lines << Hasta.tab_separated_line(line)
    end

    # A data source backed by the lines written so far.
    def data_source
      InMemoryDataSource.new(lines, name)
    end

    # No-op: in-memory data needs no flushing. Returns self.
    def close
      self
    end

    def to_s
      return super unless name

      "#<#{self.class.name}:#{name} (#{lines.count} lines)>"
    end

    private

    # Lazily-created line buffer.
    def lines
      @lines ||= []
    end
  end
end
@@ -0,0 +1,35 @@
# Copyright Swipely, Inc. All rights reserved.

require 'forwardable'

module Hasta
  # Data source for reading data from memory
  class InMemoryDataSource
    attr_reader :name

    def initialize(lines, name=nil)
      @lines = lines
      @name = name
    end

    # Yields each stored line in order; returns an Enumerator when no
    # block is given.
    def each_line(&block)
      return enum_for(:each_line) unless block

      lines.each(&block)
    end

    # The backing array of lines.
    def to_a
      lines
    end

    def to_s
      "#<#{self.class.name}:#{name} size=#{lines.count} lines>"
    end

    private

    attr_reader :lines
  end
end
@@ -0,0 +1,45 @@
# Copyright Swipely, Inc. All rights reserved.

module Hasta
  # Interpolates scheduled start time expressions in S3 path strings
  class InterpolateString
    INTERPOLATE_PATTERN = /\#\{format\(@scheduledStartTime,'(.*?)'\)\}/

    # Data Pipeline date tokens and their strftime equivalents.
    # Order matters: 'MM' must be rewritten before 'mm'.
    PATTERN_CONVERSIONS = {
      'YYYY' => '%Y',
      'MM' => '%m',
      'dd' => '%d',
      'HH' => '%H',
      'mm' => '%M',
      'ss' => '%S',
    }

    # Convenience: interpolate +pattern+ against +context+ in one call.
    def self.evaluate(pattern, context)
      new(pattern).evaluate(context)
    end

    def initialize(pattern)
      @pattern = pattern
    end

    # Replaces every scheduled-start-time expression in the pattern with
    # the formatted time taken from context['scheduledStartTime'].
    def evaluate(context)
      pattern.gsub(INTERPOLATE_PATTERN) do
        formatted_start_time(context, Regexp.last_match(1))
      end
    end

    private

    attr_reader :pattern

    # Formats the context's scheduled start time with the converted pattern.
    def formatted_start_time(context, date_pattern)
      context['scheduledStartTime'].strftime(convert_pattern(date_pattern))
    end

    # Translates a Data Pipeline date pattern into a strftime pattern.
    def convert_pattern(date_pattern)
      PATTERN_CONVERSIONS.reduce(date_pattern) do |converted, (from, to)|
        converted.gsub(from, to)
      end
    end
  end
end
@@ -0,0 +1,12 @@
# Copyright Swipely, Inc. All rights reserved.

require 'hasta/s3_uri'

module Hasta
  # Resolves the local file path for an S3 URI
  module LocalFilePath
    # Maps an S3 URI onto the local storage tree:
    # <local_storage_root>/<bucket>/<path>, fully expanded.
    def self.for(s3_uri)
      segments = [Hasta.local_storage_root, s3_uri.bucket, s3_uri.path]
      File.expand_path(File.join(*segments))
    end
  end
end
@@ -0,0 +1,41 @@
# Copyright Swipely, Inc. All rights reserved.

require 'fog'

require 'hasta/s3_uri'
require 'hasta/storage'

module Hasta
  # The read/write file storage interface to the local representation of the S3 data used
  # by the local map/reduce jobs
  class LocalStorage
    include Storage

    # Writes every line of +data_source+ to local storage at +s3_uri+.
    # Directory-style URIs get a 'part-00000' file appended, mirroring
    # Hadoop's output file naming. Returns the S3 URI that was written.
    def write(s3_uri, data_source)
      contents = StringIO.new
      data_source.each_line do |line|
        contents << line
      end

      write_to(
        s3_uri.file? ? s3_uri : s3_uri.append('part-00000'),
        contents.string
      )
    end

    private

    # Creates the bucket on demand and stores +contents+ at the URI's path.
    # Returns the S3 URI.
    def write_to(s3_uri, contents)
      write_bucket = bucket(s3_uri) || create_bucket(s3_uri.bucket)
      create_file(write_bucket, s3_uri.path, contents)

      s3_uri
    end

    # Selects the files that are direct children of +s3_uri+.
    def fog_files(s3_bucket, s3_uri)
      s3_bucket.files.select { |file|
        file.key.start_with?(s3_uri.path) && file_s3_uri(file).parent == s3_uri
      }
    end
  end
end
@@ -0,0 +1,23 @@
# Copyright Swipely, Inc. All rights reserved.

require 'hasta/combined_data_source'
require 'hasta/execution_context'
require 'hasta/in_memory_data_sink'

module Hasta
  # A wrapper for instantiating a mapper from a definition file and invoking it
  class Mapper
    attr_reader :mapper_file

    def initialize(mapper_file)
      @mapper_file = mapper_file
    end

    # Runs the mapper over the combined data sources, writing its output
    # to +data_sink+ (an in-memory sink by default).
    def map(execution_context, data_sources, data_sink = InMemoryDataSink.new('Mapper Output'))
      Hasta.logger.debug "Starting mapper: #{mapper_file}"

      combined = CombinedDataSource.new(data_sources)
      execution_context.execute(mapper_file, combined, data_sink)
    end
  end
end
@@ -0,0 +1,29 @@
# Copyright Swipely, Inc. All rights reserved.

require 'hasta/execution_context'
require 'hasta/in_memory_data_sink'
require 'hasta/sorted_data_source'

module Hasta
  # A wrapper for instantiating a reducer from a definition file and invoking it
  class Reducer
    attr_reader :reducer_file

    def initialize(reducer_file)
      @reducer_file = reducer_file
    end

    # Runs the reducer over a sorted view of +data_source+, then closes
    # the sink. Returns the closed sink.
    def reduce(execution_context, data_source, data_sink = InMemoryDataSink.new("Reducer Output"))
      Hasta.logger.debug "Starting reducer: #{reducer_file}"
      sorted = sorted_data_source(data_source)
      execution_context.execute(reducer_file, sorted, data_sink)

      data_sink.close
    end

    private

    # Wraps the input so the reducer sees key-sorted lines, as Hadoop would.
    def sorted_data_source(data_source)
      SortedDataSource.new(data_source)
    end
  end
end
@@ -0,0 +1,29 @@
# Copyright Swipely, Inc. All rights reserved.

require 'hasta/cached_s3_file'
require 'hasta/s3_file_cache'

module Hasta
  # Retrieves a file from the local cache instead of S3,
  # or retrieves it from S3 and caches it locally
  class ResolveCachedS3File
    def initialize(file_cache, child_resolver)
      @file_cache = file_cache
      @child_resolver = child_resolver
    end

    # Resolves +fog_file+ through the child resolver. If the resolved
    # file's fingerprint is already cached, serves the cached copy;
    # otherwise caches the freshly resolved body and returns it.
    def resolve(fog_file)
      resolved = child_resolver.resolve(fog_file)
      if cached_file = file_cache.get(resolved.fingerprint)
        Hasta.logger.debug "Retrieved file: #{resolved.s3_uri} from local cache"
        CachedS3File.new(cached_file, resolved.s3_uri)
      else
        file_cache.put(resolved.fingerprint, resolved.body)
        Hasta.logger.debug "Cached file: #{resolved.s3_uri} locally"
        resolved
      end
    end

    private

    # Collaborators are implementation details; sibling resolvers keep
    # theirs private, so these readers are private here for consistency.
    attr_reader :file_cache, :child_resolver
  end
end
@@ -0,0 +1,22 @@
# Copyright Swipely, Inc. All rights reserved.

require 'hasta/filtered_s3_file'

module Hasta
  # Creates a Hasta filtered S3 file instance given a Fog file
  class ResolveFilteredS3File
    def initialize(filter, child_resolver = Hasta::Storage::ResolveS3File)
      @filter = filter
      @child_resolver = child_resolver
    end

    # Resolves the file via the child resolver, then wraps it so reads
    # pass through the filter.
    def resolve(fog_file)
      resolved = child_resolver.resolve(fog_file)
      FilteredS3File.new(resolved, filter)
    end

    private

    attr_reader :filter, :child_resolver
  end
end
@@ -0,0 +1,32 @@
# Copyright Swipely, Inc. All rights reserved.

require 'hasta/env'
require 'hasta/execution_context'

module Hasta
  # Runs a map/reduce job locally
  class Runner
    attr_reader :job_name

    def initialize(job_name, mapper, reducer = nil)
      @job_name = job_name
      @mapper = mapper
      @reducer = reducer
    end

    # Executes the job. With no reducer this is a map-only job writing
    # straight to +data_sink+; otherwise the mapper's output is fed to
    # the reducer, which writes to the sink.
    def run(data_sources, data_sink, ruby_files = [], env = Hasta::Env.new)
      Hasta.logger.debug "Starting Job: #{job_name}"

      context = ExecutionContext.new(ruby_files, env.setup)
      return mapper.map(context, data_sources, data_sink) unless reducer

      mapped = mapper.map(context, data_sources).data_source
      reducer.reduce(context, mapped, data_sink)
    end

    private

    attr_reader :mapper, :reducer
  end
end
@@ -0,0 +1,48 @@
# Copyright Swipely, Inc. All rights reserved.

require 'hasta/local_file_path'
require 'hasta/s3_data_source'

module Hasta
  # Data sink for writing data to S3 storage
  class S3DataSink
    attr_reader :s3_uri

    def initialize(s3_uri, combined_storage = Hasta.combined_storage)
      @s3_uri = s3_uri
      @combined_storage = combined_storage
    end

    # Buffers a line for writing, normalized to tab-separated form.
    def <<(line)
      lines << Hasta.tab_separated_line(line)
    end

    # Writes all buffered lines to storage in one shot and logs where
    # they landed. Returns self.
    def close
      storage_uri = combined_storage.write(s3_uri, contents)
      Hasta.logger.debug(
        "Wrote #{lines.count} lines to uri: #{storage_uri} (#{LocalFilePath.for(storage_uri)})"
      )

      self
    end

    # A data source that reads back what this sink wrote.
    def data_source
      S3DataSource.new(s3_uri, combined_storage)
    end

    def to_s
      "#<#{self.class.name}:#{s3_uri}>"
    end

    private

    attr_reader :combined_storage

    # Lazily-created write buffer.
    def lines
      @lines ||= []
    end

    # The buffered lines as a single newline-delimited string.
    def contents
      lines.join("\n")
    end
  end
end
@@ -0,0 +1,41 @@
# Copyright Swipely, Inc. All rights reserved.

require 'hasta/s3_uri'
require 'hasta/combined_storage'

module Hasta
  # Data source for reading data from S3
  class S3DataSource
    def initialize(s3_uri, combined_storage = Hasta.combined_storage)
      @s3_uri = s3_uri
      @combined_storage = combined_storage
    end

    # The source's display name is its S3 URI.
    def name
      s3_uri.to_s
    end

    # Yields every line of every file under the URI; returns an
    # Enumerator when no block is given.
    def each_line
      return enum_for(:each_line) unless block_given?

      combined_storage.files_for(s3_uri).each do |file|
        location = file.remote? ? 'remote' : 'local'
        Hasta.logger.debug(
          "Processing #{location} #{file.class}: #{file.key}"
        )
        file.each_line { |line| yield line }
      end
    end

    # Materializes every line into an array.
    def to_a
      each_line.to_a
    end

    def to_s
      "#<#{self.class.name}:#{s3_uri}>"
    end

    private

    attr_reader :s3_uri, :combined_storage
  end
end
@@ -0,0 +1,56 @@
# Copyright Swipely, Inc. All rights reserved.

require 'digest/md5'
require 'forwardable'
require 'stringio'

module Hasta
  # Hasta's interface to the File objects returned by Fog
  class S3File
    extend Forwardable

    def_delegators :s3_file, :key, :body

    # Wraps each element of the collection via .wrap.
    def self.wrap_files(s3_files)
      s3_files.map { |s3_file| wrap(s3_file) }
    end

    # Wraps a Fog file in an S3File. Passes through nil and values that
    # are already S3File instances.
    def self.wrap(s3_file)
      if self === s3_file
        s3_file
      elsif s3_file.nil?
        nil
      else
        new(s3_file)
      end
    end

    def initialize(s3_file)
      @s3_file = s3_file
    end

    # The file's S3 URI, derived from its Fog directory and key.
    def s3_uri
      @s3_uri ||= S3URI.new(s3_file.directory.key, key)
    end

    # Content fingerprint: the S3 ETag when the backing file exposes one,
    # otherwise an MD5 digest of the body.
    def fingerprint
      @fingerprint ||= if s3_file.respond_to? :etag
        s3_file.etag
      else
        Digest::MD5.hexdigest(body)
      end
    end

    # True unless the file is backed by Fog's local-storage provider.
    def remote?
      !(Fog::Storage::Local::File === s3_file)
    end

    # Yields each line of the body; returns an Enumerator when no block
    # is given.
    def each_line
      return enum_for(:each_line) unless block_given?

      StringIO.new(s3_file.body).each_line { |line| yield line }
    end

    private

    attr_reader :s3_file
  end
end