hasta 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: adeafa9f4dae6acb9dd59dc39432d4e4b3d86b79
4
- data.tar.gz: 3f15a082f3c72d9a75df32e728cff7cc0ff88787
3
+ metadata.gz: 181a02eb6d37332af6efde332af20305f692f3e0
4
+ data.tar.gz: b484f8ad5a8884b7f13e03caee7277f6b00d521b
5
5
  SHA512:
6
- metadata.gz: 0e0dc797ebb1acab0781324a47ed6184f51797a1b3893da9ac069024dc9cfadd56f6dcb5c1338855e59c1c0a82745c601102fd62e4de58a198ed278332ac5f24
7
- data.tar.gz: 84a5056c2c069512a99eb8b4d9b58c9dcb0827732ffe555f7708705da0579706be7659abbfc6f4b97e4e458b02e01d2a25c38489c240e0a166d1e2b316d87132
6
+ metadata.gz: a2206da5b8b905a66037ac9c23e7faab4c4a3254ef127988620bbe36eb6a04273677d8cf42785dacb889c0b7b35c9636549462a80ca6849f2b576839141b9756
7
+ data.tar.gz: c7d96b9a9d4383e25156cad5ef58250b9713881eb70251b83347724264f08ef7adbe8ecdbc261f17c776bb12d6106466beb167a1284736fd8a0147a79d2bef25
data/.travis.yml ADDED
@@ -0,0 +1,6 @@
1
+ language: ruby
2
+ rvm:
3
+ - 1.8.7
4
+ - 1.9.3
5
+ - 2.0.0
6
+ script: bundle exec rake
data/README.md CHANGED
@@ -1,4 +1,5 @@
1
1
  # Hasta
2
+ [![travis-ci](https://travis-ci.org/swipely/hasta.png?branch=master)](https://travis-ci.org/swipely/hasta)
2
3
 
3
4
  <b>HA</b>doop <b>S</b>treaming <b>T</b>est h<b>A</b>rness for Amazon EMR
4
5
 
@@ -85,15 +86,17 @@ To control which credentials Hasta is using when communicating with S3, update y
85
86
  ## Data Filtering
86
87
 
87
88
  Hasta automatically filters input data to minimize execution time.
88
- By default, Hasta looks for a file in the current directory named `filter_config.txt` for the filtering configuration.
89
+ By default, Hasta looks for a file in the current directory named `filter_config.yml` for the filtering configuration.
89
90
  You can change the filter configuration file by setting the `HASTA_DATA_FILTER_FILE` environment variable.
90
91
  If you want to disable data filtering, you can set the `HASTA_DATA_FILTERING` environment variable to `OFF`.
91
92
 
92
93
  ### Filter Configuration
93
94
 
94
- The filter configuration file contains regular expressions, one per line.
95
- Any line of input data that matches at least one of the regular expressions is included in the test input.
96
- Lines that do no match any of the regular expressions are excluded from the test input.
95
+ The filter configuration file is a YAML file containing a Hash that maps S3 URIs (as Strings) to Arrays of regular expressions (also as Strings).
96
+ Any line of input data that comes from an S3 URI whose prefix matches one of the S3 URIs in the filter configuration that matches at least one of the regular expressions is included in the test input.
97
+ Any line of input data that comes from an S3 URI whose prefix matches one of the S3 URIs in the filter configuration that does not match any of the regular expressions is excluded from the test input.
98
+ Input data that does not come from an S3 URI whose prefix matches one of the S3 URIs in the filter configuration is not filtered.
99
+ If an input S3 URI matches multiple S3 URIs in the filter configuration, the most specific match is the one that is chosen for filtering purposes.
97
100
 
98
101
  ### Caching
99
102
 
@@ -14,7 +14,7 @@ module Hasta
14
14
  # Global configuration settings
15
15
  class Configuration
16
16
  attr_accessor :project_root
17
- attr_writer :local_storage_root, :cache_storage_root, :project_steps, :logger, :filter
17
+ attr_writer :local_storage_root, :cache_storage_root, :project_steps, :logger, :filters
18
18
 
19
19
  def local_storage_root
20
20
  @local_storage_root ||= '~/fog'
@@ -43,13 +43,13 @@ module Hasta
43
43
  )
44
44
  end
45
45
 
46
- def filter
47
- unless @filter || ENV['HASTA_DATA_FILTERING'] == 'OFF'
48
- filter_file = ENV['HASTA_DATA_FILTER_FILE'] || 'filter_config.txt'
49
- @filter ||= Filter.from_file(filter_file)
46
+ def filters
47
+ unless @filters || ENV['HASTA_DATA_FILTERING'] == 'OFF'
48
+ filter_file = ENV['HASTA_DATA_FILTER_FILE'] || 'filter_config.yml'
49
+ @filters ||= Filters.from_file(filter_file)
50
50
  end
51
51
 
52
- @filter
52
+ @filters
53
53
  end
54
54
 
55
55
  private
@@ -76,9 +76,9 @@ module Hasta
76
76
  end
77
77
 
78
78
  def resolver
79
- if filter
79
+ if filters
80
80
  ResolveCachedS3File.new(
81
- S3FileCache.new(fog_cache_storage), ResolveFilteredS3File.new(filter)
81
+ S3FileCache.new(fog_cache_storage), ResolveFilteredS3File.new(filters)
82
82
  )
83
83
  else
84
84
  Hasta::Storage::ResolveS3File
@@ -0,0 +1,34 @@
1
+ # Copyright Swipely, Inc. All rights reserved.
2
+
3
+ require 'yaml'
4
+
5
+ require 'hasta/s3_uri'
6
+
7
+ module Hasta
8
+ # Defines filters for different S3 path prefixes
9
+ class Filters
10
+ def self.from_file(file)
11
+ Hasta.logger.debug "Loading data filter file: #{File.expand_path(file)}"
12
+ new(YAML.load_file(file))
13
+ rescue => ex
14
+ raise ConfigurationError.new,
15
+ "Failed to load filter configuration file: #{file} - #{ex.message}"
16
+ end
17
+
18
+ def initialize(filters)
19
+ @filters = filters.map { |s3_uri, regexes|
20
+ [S3URI.parse(s3_uri), Filter.new(*regexes.map { |regex| Regexp.new(regex) }) ]
21
+ }.sort_by { |s3_uri, regexes| s3_uri.depth }.reverse
22
+ end
23
+
24
+ def for_s3_uri(target_s3_uri)
25
+ if match = filters.find { |s3_uri, filter| target_s3_uri.start_with?(s3_uri) }
26
+ match[1]
27
+ end
28
+ end
29
+
30
+ private
31
+
32
+ attr_reader :filters
33
+ end
34
+ end
@@ -1,22 +1,27 @@
1
1
  # Copyright Swipely, Inc. All rights reserved.
2
2
 
3
+ require 'hasta/filters'
3
4
  require 'hasta/filtered_s3_file'
4
5
 
5
6
  module Hasta
6
7
  # Creates a Hasta filtered S3 file instance given a Fog file
7
8
  class ResolveFilteredS3File
8
- def initialize(filter, child_resolver = Hasta::Storage::ResolveS3File)
9
- @filter = filter
9
+ def initialize(filters, child_resolver = Hasta::Storage::ResolveS3File)
10
+ @filters = filters
10
11
  @child_resolver = child_resolver
11
12
  end
12
13
 
13
14
  def resolve(fog_file)
14
- FilteredS3File.new(child_resolver.resolve(fog_file), filter)
15
+ s3_file = child_resolver.resolve(fog_file)
16
+ if filter = filters.for_s3_uri(s3_file.s3_uri)
17
+ FilteredS3File.new(s3_file, filter)
18
+ else
19
+ s3_file
20
+ end
15
21
  end
16
22
 
17
-
18
23
  private
19
24
 
20
- attr_reader :filter, :child_resolver
25
+ attr_reader :filters, :child_resolver
21
26
  end
22
27
  end
data/lib/hasta/s3_uri.rb CHANGED
@@ -20,7 +20,7 @@ module Hasta
20
20
  end
21
21
 
22
22
  def directory?
23
- path.end_with?('/')
23
+ path.nil? || path.end_with?('/')
24
24
  end
25
25
 
26
26
  def file?
@@ -35,6 +35,24 @@ module Hasta
35
35
  end
36
36
  end
37
37
 
38
+ def depth
39
+ slashes = (path && path.chars.count { |ch| ch == '/' }) || 0
40
+ if path.nil?
41
+ 1
42
+ elsif directory?
43
+ 1 + slashes
44
+ else
45
+ 2 + slashes
46
+ end
47
+ end
48
+
49
+ def start_with?(s3_uri)
50
+ return true if self == s3_uri
51
+ return false if s3_uri.file?
52
+
53
+ (bucket == s3_uri.bucket) && (s3_uri.path.nil? || path.start_with?(s3_uri.path))
54
+ end
55
+
38
56
  def parent
39
57
  if path.nil?
40
58
  nil
data/lib/hasta/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Hasta
2
- VERSION = "0.1.0"
2
+ VERSION = "0.1.1"
3
3
  end
@@ -0,0 +1,3 @@
1
+ ---
2
+ s3://my-bucket:
3
+ - \AOnly Allow This Line\z
@@ -5,25 +5,19 @@ require 'spec_helper'
5
5
  require 'hasta/in_memory_data_source'
6
6
 
7
7
  describe Hasta::Configuration do
8
- describe '#filter' do
8
+ describe '#filters' do
9
9
  subject { described_class.new }
10
10
 
11
11
  context 'given a custom path is specified' do
12
12
  before do
13
- ENV['HASTA_DATA_FILTER_FILE'] = 'spec/fixtures/hasta/filter_config.txt'
13
+ ENV['HASTA_DATA_FILTER_FILE'] = 'spec/fixtures/hasta/filter_config.yml'
14
14
  end
15
15
 
16
16
  after do
17
17
  ENV.delete('HASTA_DATA_FILTER_FILE')
18
18
  end
19
19
 
20
- let(:allowed_line) { 'Only Allow This Line' }
21
- let(:disallowed_line) { 'Do Not Allow Any Other Lines' }
22
- let(:filter_proc) { subject.filter.to_proc }
23
-
24
- it { expect(subject.filter).to_not be_nil }
25
- it { expect(filter_proc.call(allowed_line)).to be_true }
26
- it { expect(filter_proc.call(disallowed_line)).to_not be_true }
20
+ it { expect(subject.filters).to_not be_nil }
27
21
 
28
22
  context 'given filtering is disabled' do
29
23
  before do
@@ -41,7 +35,7 @@ describe Hasta::Configuration do
41
35
  let(:local_storage_root) { Dir.mktmpdir('config_test_local_dir') }
42
36
  let(:local_s3_uri) { Hasta::S3URI.new('my-bucket', 'path/to/my/file.txt') }
43
37
 
44
- it { expect(subject.filter).to be_nil }
38
+ it { expect(subject.filters).to be_nil }
45
39
  it { expect(subject.combined_storage.files_for(local_s3_uri)).to_not be_empty }
46
40
  end
47
41
  end
@@ -6,13 +6,13 @@ describe Hasta::EmrJobDefinition do
6
6
  shared_context 'with a filter defined' do
7
7
  before do
8
8
  Hasta.configure do |config|
9
- config.filter = Hasta::Filter.new(/.*/)
9
+ config.filters = Hasta::Filters.new({ 's3://my-bucket' => ['.*'] })
10
10
  end
11
11
  end
12
12
 
13
13
  after do
14
14
  Hasta.configure do |config|
15
- config.filter = nil
15
+ config.filters = nil
16
16
  end
17
17
  end
18
18
  end
@@ -0,0 +1,55 @@
1
+ # Copyright Swipely, Inc. All rights reserved.
2
+
3
+ require 'spec_helper'
4
+
5
+ require 'hasta/filters'
6
+
7
+ describe Hasta::Filters do
8
+ describe '.from_file' do
9
+ let(:non_existent_file) { 'spec/fixtures/hasta/non_existent_file.dll' }
10
+
11
+ context 'given an non-existent file' do
12
+ it 'raises a ConfigurationError' do
13
+ expect {
14
+ described_class.from_file(non_existent_file)
15
+ }.to raise_error(Hasta::ConfigurationError)
16
+ end
17
+ end
18
+ end
19
+
20
+ describe '#for_s3_uri' do
21
+ subject { described_class.new(filters) }
22
+
23
+ let(:filters) {
24
+ {
25
+ 's3://my-bucket/path1/' => ['.*'],
26
+ 's3://my-bucket/path/to/file.txt' => ['[a-z]{2}', '\A_.*'],
27
+ 's3://my-bucket/path1/path2/' => ['z.+', 'a.+', 'x.?'],
28
+ }
29
+ }
30
+
31
+ context 'given no match' do
32
+ let(:s3_uri) { Hasta::S3URI.new('other-bucket', 'path/to/file.txt') }
33
+
34
+ it 'returns nil' do
35
+ expect(subject.for_s3_uri(s3_uri)).to be_nil
36
+ end
37
+ end
38
+
39
+ context 'given a single match' do
40
+ let(:s3_uri) { Hasta::S3URI.new('my-bucket', 'path/to/file.txt') }
41
+
42
+ it 'selects the only match' do
43
+ expect(subject.for_s3_uri(s3_uri).to_s).to eq('#<Hasta::Filter:[/[a-z]{2}/, /\\A_.*/]>')
44
+ end
45
+ end
46
+
47
+ context 'given multiple matches' do
48
+ let(:s3_uri) { Hasta::S3URI.new('my-bucket', 'path1/path2/file3.txt') }
49
+
50
+ it 'selects the most specific match' do
51
+ expect(subject.for_s3_uri(s3_uri).to_s).to eq('#<Hasta::Filter:[/a.+/, /x.?/, /z.+/]>')
52
+ end
53
+ end
54
+ end
55
+ end
@@ -0,0 +1,30 @@
1
+ # Copyright Swipely, Inc. All rights reserved.
2
+
3
+ require 'spec_helper'
4
+
5
+ describe Hasta::ResolveFilteredS3File do
6
+ describe '#resolve' do
7
+ subject { described_class.new(filters) }
8
+
9
+ let(:filters) { Hasta::Filters.new('s3://my-bucket/path/to/my/files/' => ['\A\d{1,3}.*']) }
10
+ let(:fog_file) {
11
+ double('Fog::File',
12
+ :directory => double('Fog::Directory', :key => bucket_name),
13
+ :key => path
14
+ )
15
+ }
16
+ let(:bucket_name) { 'my-bucket' }
17
+
18
+ context 'given a filtered file' do
19
+ let(:path) { 'path/to/my/files/1.txt' }
20
+
21
+ it { expect(subject.resolve(fog_file)).to be_kind_of(Hasta::FilteredS3File) }
22
+ end
23
+
24
+ context 'given a non-filtered file' do
25
+ let(:path) { 'path/to/your/files/1.txt' }
26
+
27
+ it { expect(subject.resolve(fog_file)).to be_kind_of(Hasta::S3File) }
28
+ end
29
+ end
30
+ end
@@ -9,8 +9,8 @@ describe Hasta::ResolveCachedS3File do
9
9
  subject { described_class.new(file_cache, child_resolver) }
10
10
 
11
11
  let(:file_cache) { double(Hasta::S3FileCache) }
12
- let(:child_resolver) { Hasta::ResolveFilteredS3File.new(filter) }
13
- let(:filter) { Hasta::Filter.new(/.*/) }
12
+ let(:child_resolver) { Hasta::ResolveFilteredS3File.new(filters) }
13
+ let(:filters) { Hasta::Filters.new({ "s3://#{bucket_name}" => ['.*'] }) }
14
14
 
15
15
  let(:fog_file) {
16
16
  double('Fog::File',
@@ -24,9 +24,10 @@ describe Hasta::ResolveCachedS3File do
24
24
  let(:path) { 'path/to/my/file.txt' }
25
25
  let(:body) { "Parts\n" }
26
26
  let(:s3_uri) { Hasta::S3URI.new(bucket_name, path) }
27
+ let(:exp_filter) { filters.for_s3_uri(s3_uri) }
27
28
 
28
29
  let(:exp_fingerprint) {
29
- Digest::MD5.hexdigest("#{Digest::MD5.hexdigest(body)}_#{filter.to_s}")
30
+ Digest::MD5.hexdigest("#{Digest::MD5.hexdigest(body)}_#{exp_filter.to_s}")
30
31
  }
31
32
 
32
33
  let(:result) { subject.resolve(fog_file) }
@@ -14,6 +14,7 @@ describe Hasta::S3URI do
14
14
 
15
15
  it { expect(subject.bucket).to eq(bucket) }
16
16
  it { expect(subject.path).to be_nil }
17
+ it { expect(subject.depth).to eq(1) }
17
18
  end
18
19
 
19
20
  context 'given a bucket and a file path' do
@@ -23,6 +24,7 @@ describe Hasta::S3URI do
23
24
 
24
25
  it { expect(subject.bucket).to eq(bucket) }
25
26
  it { expect(subject.path).to eq(path) }
27
+ it { expect(subject.depth).to eq(4) }
26
28
  end
27
29
 
28
30
  context 'given a bucket and a directory path' do
@@ -32,6 +34,7 @@ describe Hasta::S3URI do
32
34
 
33
35
  it { expect(subject.bucket).to eq(bucket) }
34
36
  it { expect(subject.path).to eq(path) }
37
+ it { expect(subject.depth).to eq(4) }
35
38
  end
36
39
 
37
40
  context 'given an s3n URI with a bucket and a directory path' do
@@ -41,6 +44,7 @@ describe Hasta::S3URI do
41
44
 
42
45
  it { expect(subject.bucket).to eq(bucket) }
43
46
  it { expect(subject.path).to eq(path) }
47
+ it { expect(subject.depth).to eq(4) }
44
48
  end
45
49
 
46
50
  context 'given an invalid uri' do
@@ -66,6 +70,12 @@ describe Hasta::S3URI do
66
70
 
67
71
  it { expect(subject).to_not be_file }
68
72
  end
73
+
74
+ context 'given a bucket only path' do
75
+ let(:path) { nil }
76
+
77
+ it { expect(subject).to_not be_file }
78
+ end
69
79
  end
70
80
 
71
81
  describe '#basename' do
@@ -148,4 +158,24 @@ describe Hasta::S3URI do
148
158
  it { expect(subject.parent).to eq(described_class.new(bucket, 'path/to/my/')) }
149
159
  end
150
160
  end
161
+
162
+ describe '#start_with?' do
163
+ subject { described_class.new(bucket, path) }
164
+
165
+ let(:bucket) { 'my-bucket' }
166
+ let(:path) { 'path/to/my/favorite/file.txt' }
167
+
168
+ let(:other_bucket_uri) { described_class.new('some-other-bucket', path) }
169
+ let(:other_path_uri) { described_class.new(bucket, 'path/to/your/favorite/file.txt') }
170
+ let(:similar_prefix_uri) { described_class.new(bucket, 'path/to/my/fav') }
171
+ let(:bucket_only_uri) { described_class.new(bucket, nil) }
172
+
173
+ it { expect(subject.start_with?(subject)).to be_true }
174
+ it { expect(subject.start_with?(subject.parent)).to be_true }
175
+ it { expect(subject.start_with?(subject.parent.parent)).to be_true }
176
+ it { expect(subject.start_with?(other_bucket_uri)).to be_false }
177
+ it { expect(subject.start_with?(other_path_uri)).to be_false }
178
+ it { expect(subject.start_with?(similar_prefix_uri)).to be_false }
179
+ it { expect(subject.start_with?(bucket_only_uri)).to be_true }
180
+ end
151
181
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: hasta
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - danhodge
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2014-04-09 00:00:00.000000000 Z
11
+ date: 2014-04-11 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: fog
@@ -118,6 +118,7 @@ files:
118
118
  - .cane
119
119
  - .gitignore
120
120
  - .rspec
121
+ - .travis.yml
121
122
  - Gemfile
122
123
  - LICENSE.txt
123
124
  - README.md
@@ -134,6 +135,7 @@ files:
134
135
  - lib/hasta/execution_context.rb
135
136
  - lib/hasta/filter.rb
136
137
  - lib/hasta/filtered_s3_file.rb
138
+ - lib/hasta/filters.rb
137
139
  - lib/hasta/identity_mapper.rb
138
140
  - lib/hasta/identity_reducer.rb
139
141
  - lib/hasta/in_memory_data_sink.rb
@@ -157,7 +159,7 @@ files:
157
159
  - lib/hasta/tasks.rb
158
160
  - lib/hasta/tasks/runner.rb
159
161
  - lib/hasta/version.rb
160
- - spec/fixtures/hasta/filter_config.txt
162
+ - spec/fixtures/hasta/filter_config.yml
161
163
  - spec/fixtures/hasta/json/emr_node.json
162
164
  - spec/fixtures/hasta/json/pipeline_definition.json
163
165
  - spec/fixtures/hasta/lib/failing_mapper.rb
@@ -175,6 +177,7 @@ files:
175
177
  - spec/hasta/execution_context_spec.rb
176
178
  - spec/hasta/filter_spec.rb
177
179
  - spec/hasta/filtered_s3_file_spec.rb
180
+ - spec/hasta/filters_spec.rb
178
181
  - spec/hasta/identity_mapper_spec.rb
179
182
  - spec/hasta/identity_reducer_spec.rb
180
183
  - spec/hasta/interpolate_string_spec.rb
@@ -182,6 +185,7 @@ files:
182
185
  - spec/hasta/local_storage_spec.rb
183
186
  - spec/hasta/mapper_spec.rb
184
187
  - spec/hasta/reducer_spec.rb
188
+ - spec/hasta/resolve_filtered_s3_file_spec.rb
185
189
  - spec/hasta/resolved_cached_s3_file_spec.rb
186
190
  - spec/hasta/s3_data_source_spec.rb
187
191
  - spec/hasta/s3_file_cache_spec.rb
@@ -217,7 +221,7 @@ signing_key:
217
221
  specification_version: 4
218
222
  summary: HAdoop Streaming Test hArness
219
223
  test_files:
220
- - spec/fixtures/hasta/filter_config.txt
224
+ - spec/fixtures/hasta/filter_config.yml
221
225
  - spec/fixtures/hasta/json/emr_node.json
222
226
  - spec/fixtures/hasta/json/pipeline_definition.json
223
227
  - spec/fixtures/hasta/lib/failing_mapper.rb
@@ -235,6 +239,7 @@ test_files:
235
239
  - spec/hasta/execution_context_spec.rb
236
240
  - spec/hasta/filter_spec.rb
237
241
  - spec/hasta/filtered_s3_file_spec.rb
242
+ - spec/hasta/filters_spec.rb
238
243
  - spec/hasta/identity_mapper_spec.rb
239
244
  - spec/hasta/identity_reducer_spec.rb
240
245
  - spec/hasta/interpolate_string_spec.rb
@@ -242,6 +247,7 @@ test_files:
242
247
  - spec/hasta/local_storage_spec.rb
243
248
  - spec/hasta/mapper_spec.rb
244
249
  - spec/hasta/reducer_spec.rb
250
+ - spec/hasta/resolve_filtered_s3_file_spec.rb
245
251
  - spec/hasta/resolved_cached_s3_file_spec.rb
246
252
  - spec/hasta/s3_data_source_spec.rb
247
253
  - spec/hasta/s3_file_cache_spec.rb
@@ -1 +0,0 @@
1
- \AOnly Allow This Line\z