wonderdog 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile ADDED
@@ -0,0 +1,8 @@
1
+ source :rubygems
2
+
3
+ gemspec
4
+
5
+ group :development do
6
+ gem 'rake', '~> 0.9'
7
+ gem 'rspec', '~> 2'
8
+ end
@@ -0,0 +1,57 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ wonderdog (0.0.1)
5
+ wukong (= 3.0.0.pre3)
6
+ wukong-hadoop (>= 0.0.2)
7
+
8
+ GEM
9
+ remote: http://rubygems.org/
10
+ specs:
11
+ configliere (0.4.18)
12
+ highline (>= 1.5.2)
13
+ multi_json (>= 1.1)
14
+ diff-lcs (1.1.3)
15
+ eventmachine (1.0.0)
16
+ forgery (0.5.0)
17
+ gorillib (0.4.2)
18
+ configliere (>= 0.4.13)
19
+ json
20
+ multi_json (>= 1.1)
21
+ highline (1.6.15)
22
+ json (1.7.5)
23
+ log4r (1.1.10)
24
+ multi_json (1.5.0)
25
+ rake (0.9.6)
26
+ rspec (2.12.0)
27
+ rspec-core (~> 2.12.0)
28
+ rspec-expectations (~> 2.12.0)
29
+ rspec-mocks (~> 2.12.0)
30
+ rspec-core (2.12.2)
31
+ rspec-expectations (2.12.1)
32
+ diff-lcs (~> 1.1.3)
33
+ rspec-mocks (2.12.0)
34
+ uuidtools (2.1.3)
35
+ vayacondios-client (0.1.2)
36
+ configliere (>= 0.4.16)
37
+ gorillib (~> 0.4.2)
38
+ multi_json (~> 1.1)
39
+ wukong (3.0.0.pre3)
40
+ configliere (>= 0.4.18)
41
+ eventmachine
42
+ forgery
43
+ gorillib (>= 0.4.2)
44
+ log4r
45
+ multi_json (>= 1.3.6)
46
+ uuidtools
47
+ vayacondios-client (>= 0.1.2)
48
+ wukong-hadoop (0.0.2)
49
+ wukong (= 3.0.0.pre3)
50
+
51
+ PLATFORMS
52
+ ruby
53
+
54
+ DEPENDENCIES
55
+ rake (~> 0.9)
56
+ rspec (~> 2)
57
+ wonderdog!
@@ -8,6 +8,7 @@ module Wukong
8
8
  # @return [Configliere::Param] the newly configured settings
9
9
  def self.configure settings
10
10
  settings.define(:es_tmp_dir, :description => "Temporary directory on the HDFS to store job files while reading/writing to ElasticSearch", :default => "/user/#{ENV['USER']}/wukong", :wukong_hadoop => true)
11
+ settings.define(:es_lib_dir, :description => "Directory containing Elasticsearch, Wonderdog, and other support jars", :default => "/usr/lib/hadoop/lib", :wukong_hadoop => true)
11
12
  settings.define(:es_config, :description => "Where to find configuration files detailing how to join an ElasticSearch cluster", :wukong_hadoop => true)
12
13
  settings.define(:es_input_splits, :description => "Number of input splits to target when reading from ElasticSearch", :type => Integer, :wukong_hadoop => true)
13
14
  settings.define(:es_request_size, :description => "Number of objects requested during each batch read from ElasticSearch", :type => Integer, :wukong_hadoop => true)
@@ -98,8 +98,15 @@ module Wukong
98
98
  #
99
99
  # @return [Array<String>]
100
100
  def hadoop_jobconf_options
101
+ if reads_from_elasticsearch? || writes_to_elasticsearch?
102
+ settings[:map_speculative] = false if settings[:map_speculative].nil?
103
+ settings[:reduce_speculative] = false if settings[:reduce_speculative].nil?
104
+ end
105
+
101
106
  super() + [].tap do |o|
102
- o << java_opt('es.config', settings[:es_config]) if (reads_from_elasticsearch? || writes_to_elasticsearch?)
107
+ if (reads_from_elasticsearch? || writes_to_elasticsearch?)
108
+ o << java_opt('es.config', settings[:es_config])
109
+ end
103
110
 
104
111
  if reads_from_elasticsearch?
105
112
  o << java_opt('elasticsearch.input.index', input_index.index)
@@ -121,6 +128,28 @@ module Wukong
121
128
  end.flatten.compact
122
129
  end
123
130
 
131
+ # :nodoc:
132
+ #
133
+ # Munge the settings object to add necessary jars if
134
+ # reading/writing to/from Elasticsearch, then call super().
135
+ def hadoop_files
136
+ if reads_from_elasticsearch? || writes_to_elasticsearch?
137
+ settings[:jars] = elasticsearch_jars if settings[:jars].empty?
138
+ end
139
+ super()
140
+ end
141
+
142
+ # All Elasticsearch, Wonderdog, and other support jars needed to
143
+ # connect Hadoop streaming with the
144
+ # ElasticSearchStreamingInputFormat and
145
+ # ElasticSearchStreamingOutputFormat provided by the Wonderdog
146
+ # Java code.
147
+ #
148
+ # @return [Array<String>]
149
+ def elasticsearch_jars
150
+ Dir[File.join(settings[:es_lib_dir] || '/usr/lib/hadoop/lib', '{elasticsearch,lucene,jna,wonderdog}*.jar')].compact.uniq
151
+ end
152
+
124
153
  # Returns a temporary path on the HDFS in which to store log
125
154
  # data while the Hadoop job runs.
126
155
  #
@@ -129,7 +158,7 @@ module Wukong
129
158
  def elasticsearch_hdfs_tmp_dir io
130
159
  cleaner = %r{[^\w/\.\-\+]+}
131
160
  io_part = [io.index, io.mapping].compact.map { |s| s.gsub(cleaner, '') }.join('/')
132
- File.join(settings[:es_tmp_dir], io_part, Time.now.strftime("%Y-%m-%d-%H-%M-%S"))
161
+ File.join(settings[:es_tmp_dir] || '/', io_part || '', Time.now.strftime("%Y-%m-%d-%H-%M-%S"))
133
162
  end
134
163
 
135
164
  end
@@ -1,3 +1,4 @@
1
1
  module Wonderdog
2
- VERSION = '0.0.1'
2
+ # The currently running Wonderdog version
3
+ VERSION = '0.0.2'
3
4
  end
@@ -7,75 +7,120 @@ describe Wukong::Elasticsearch::HadoopInvocationOverride do
7
7
  let(:es_writer) { driver('regexp', 'count', input: '/tmp/input_file', output: 'es:///the_index/the_map') }
8
8
  let(:es_complex) { driver('regexp', 'count', input: 'es://the_index/the_map', output: 'es:///the_index/the_map', es_query: '{"hi": "there"}', es_request_size: 1000, es_index_field: 'ID') }
9
9
 
10
- context "not interacting with Elasticsearch" do
11
- subject { no_es }
12
- # input
13
- its(:input_paths) { should == '/tmp/input_file' }
14
- its(:hadoop_commandline) { should match(%r{-input.*/tmp/input_file}i) }
15
-
16
- # output
17
- its(:output_path) { should == '/tmp/output_file' }
18
- its(:hadoop_commandline) { should match(%r{-output.*/tmp/output_file}i) }
19
-
20
- # no elasticsearch anything
21
- its(:hadoop_commandline) { should_not match(/elasticsearch/i) }
10
+ context "passing necessary jars to Hadoop streaming" do
11
+ before { Dir.stub!(:[]).and_return(["/lib/dir/elasticsearch.jar"], ["/lib/dir/wonderdog.jar"]) }
12
+ context "when not given explicit jars" do
13
+ context "and not interacting with Elasticsearch" do
14
+ it "doesn't add jars" do
15
+ no_es.hadoop_commandline.should_not match('-libjars')
16
+ end
17
+ end
18
+ context "and reading from Elasticsearch" do
19
+ it "adds default jars it finds on the local filesystem" do
20
+ es_reader.hadoop_commandline.should match('-libjars.*elasticsearch')
21
+ end
22
+ end
23
+ context "and writing to Elasticsearch" do
24
+ it "adds default jars it finds on the local filesystem" do
25
+ es_writer.hadoop_commandline.should match('-libjars.*elasticsearch')
26
+ end
27
+ end
28
+ context "and reading and writing to Elasticsearch" do
29
+ it "adds default jars it finds on the local filesystem" do
30
+ es_complex.hadoop_commandline.should match('-libjars.*elasticsearch')
31
+ end
32
+ end
33
+ end
22
34
  end
23
35
 
24
- context "reading from Elasticsearch" do
25
- subject { es_reader }
26
-
27
- # input
28
- its(:input_paths) { should match(%r{/user.*wukong.*the_index.*the_map}) }
29
- its(:hadoop_commandline) { should match(/-inputformat.*elasticsearch/i) }
30
- its(:hadoop_commandline) { should match(%r{-input.*/user.*wukong.*the_index.*the_map}i) }
31
- its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.index.*the_index/i) }
32
- its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.map.*the_map/i) }
33
-
34
- # output
35
- its(:output_path) { should == '/tmp/output_file' }
36
- its(:hadoop_commandline) { should_not match(/-outputformat/i) }
37
- its(:hadoop_commandline) { should match(%r{-output.*/tmp/output_file}i) }
38
- its(:hadoop_commandline) { should_not match(/-D\s+elasticsearch\.output/i) }
36
+ context "setting speculative execution" do
37
+ context "when not given speculative options" do
38
+ context "and not interacting with Elasticsearch" do
39
+ it "doesn't add jars" do
40
+ no_es.hadoop_commandline.should_not match('speculative')
41
+ end
42
+ end
43
+ context "and reading from Elasticsearch" do
44
+ it "adds default jars it finds on the local filesystem" do
45
+ es_reader.hadoop_commandline.should match('-mapred.map.tasks.speculative.execution.*false')
46
+ es_reader.hadoop_commandline.should match('-mapred.reduce.tasks.speculative.execution.*false')
47
+ end
48
+ end
49
+ end
39
50
  end
51
+
52
+ context "handling input and output paths, formats, and options when" do
40
53
 
41
- context "writing to Elasticsearch" do
42
- subject { es_writer }
54
+ context "not interacting with Elasticsearch" do
55
+ subject { no_es }
56
+ # input
57
+ its(:input_paths) { should == '/tmp/input_file' }
58
+ its(:hadoop_commandline) { should match(%r{-input.*/tmp/input_file}i) }
43
59
 
44
- # input
45
- its(:input_paths) { should == '/tmp/input_file' }
46
- its(:hadoop_commandline) { should_not match(/-inputformat/i) }
47
- its(:hadoop_commandline) { should match(%r{-input.*/tmp/input_file}i) }
48
- its(:hadoop_commandline) { should_not match(/-D\s+elasticsearch\.input/i) }
60
+ # output
61
+ its(:output_path) { should == '/tmp/output_file' }
62
+ its(:hadoop_commandline) { should match(%r{-output.*/tmp/output_file}i) }
49
63
 
50
- # output
51
- its(:output_path) { should match(%r{/user.*wukong.*the_index.*the_map}) }
52
- its(:hadoop_commandline) { should match(/-outputformat.*elasticsearch/i) }
53
- its(:hadoop_commandline) { should match(%r{-output.*/user.*wukong.*the_index.*the_map}i) }
54
- its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.index.*the_index/i) }
55
- its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.map.*the_map/i) }
56
- end
64
+ # no elasticsearch anything
65
+ its(:hadoop_commandline) { should_not match(/elasticsearch/i) }
66
+ end
67
+
68
+ context "reading from Elasticsearch" do
69
+ subject { es_reader }
70
+
71
+ # input
72
+ its(:input_paths) { should match(%r{/user.*wukong.*the_index.*the_map}) }
73
+ its(:hadoop_commandline) { should match(/-inputformat.*elasticsearch/i) }
74
+ its(:hadoop_commandline) { should match(%r{-input.*/user.*wukong.*the_index.*the_map}i) }
75
+ its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.index.*the_index/i) }
76
+ its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.map.*the_map/i) }
77
+
78
+ # output
79
+ its(:output_path) { should == '/tmp/output_file' }
80
+ its(:hadoop_commandline) { should_not match(/-outputformat/i) }
81
+ its(:hadoop_commandline) { should match(%r{-output.*/tmp/output_file}i) }
82
+ its(:hadoop_commandline) { should_not match(/-D\s+elasticsearch\.output/i) }
83
+ end
84
+
85
+ context "writing to Elasticsearch" do
86
+ subject { es_writer }
87
+
88
+ # input
89
+ its(:input_paths) { should == '/tmp/input_file' }
90
+ its(:hadoop_commandline) { should_not match(/-inputformat/i) }
91
+ its(:hadoop_commandline) { should match(%r{-input.*/tmp/input_file}i) }
92
+ its(:hadoop_commandline) { should_not match(/-D\s+elasticsearch\.input/i) }
93
+
94
+ # output
95
+ its(:output_path) { should match(%r{/user.*wukong.*the_index.*the_map}) }
96
+ its(:hadoop_commandline) { should match(/-outputformat.*elasticsearch/i) }
97
+ its(:hadoop_commandline) { should match(%r{-output.*/user.*wukong.*the_index.*the_map}i) }
98
+ its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.index.*the_index/i) }
99
+ its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.map.*the_map/i) }
100
+ end
57
101
 
58
- context "reading and writing with many options" do
59
- subject { es_complex }
102
+ context "reading and writing with many options" do
103
+ subject { es_complex }
60
104
 
61
- # input
62
- its(:input_paths) { should match(%r{/user.*wukong.*the_index.*the_map}) }
63
- its(:hadoop_commandline) { should match(/-inputformat.*elasticsearch/i) }
64
- its(:hadoop_commandline) { should match(%r{-input.*/user.*wukong.*the_index.*the_map}i) }
65
- its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.index.*the_index/i) }
66
- its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.map.*the_map/i) }
105
+ # input
106
+ its(:input_paths) { should match(%r{/user.*wukong.*the_index.*the_map}) }
107
+ its(:hadoop_commandline) { should match(/-inputformat.*elasticsearch/i) }
108
+ its(:hadoop_commandline) { should match(%r{-input.*/user.*wukong.*the_index.*the_map}i) }
109
+ its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.index.*the_index/i) }
110
+ its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.map.*the_map/i) }
67
111
 
68
- # output
69
- its(:output_path) { should match(%r{/user.*wukong.*the_index.*the_map}) }
70
- its(:hadoop_commandline) { should match(/-outputformat.*elasticsearch/i) }
71
- its(:hadoop_commandline) { should match(%r{-output.*/user.*wukong.*the_index.*the_map}i) }
72
- its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.index.*the_index/i) }
73
- its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.map.*the_map/i) }
112
+ # output
113
+ its(:output_path) { should match(%r{/user.*wukong.*the_index.*the_map}) }
114
+ its(:hadoop_commandline) { should match(/-outputformat.*elasticsearch/i) }
115
+ its(:hadoop_commandline) { should match(%r{-output.*/user.*wukong.*the_index.*the_map}i) }
116
+ its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.index.*the_index/i) }
117
+ its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.map.*the_map/i) }
74
118
 
75
- # options
76
- its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.query.*hi.*there/i) }
77
- its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.request_size.*1000/i) }
78
- its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.index\.field.*ID/i) }
119
+ # options
120
+ its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.query.*hi.*there/i) }
121
+ its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.request_size.*1000/i) }
122
+ its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.index\.field.*ID/i) }
123
+ end
79
124
  end
80
125
 
81
126
  end
@@ -48,7 +48,7 @@ public class ElasticSearchStreamingOutputFormat<K, V> implements OutputFormat<K,
48
48
  private String idFieldName;
49
49
 
50
50
  private static final String ES_BULK_SIZE_OPT = "elasticsearch.output.bulk_size";
51
- private static final String ES_BULK_SIZE = "100";
51
+ private static final String ES_BULK_SIZE = "1000";
52
52
  private int bulkSize;
53
53
 
54
54
 
@@ -28,5 +28,6 @@ EOF
28
28
  gem.test_files = gem.files.grep(/^spec/)
29
29
  gem.require_paths = ['lib']
30
30
 
31
- gem.add_dependency('wukong', '3.0.0.pre2')
31
+ gem.add_dependency('wukong', '3.0.0.pre3')
32
+ gem.add_dependency('wukong-hadoop', '>= 0.0.2')
32
33
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wonderdog
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ authors:
13
13
  autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
- date: 2012-12-01 00:00:00.000000000 Z
16
+ date: 2012-12-17 00:00:00.000000000 Z
17
17
  dependencies:
18
18
  - !ruby/object:Gem::Dependency
19
19
  name: wukong
@@ -22,7 +22,7 @@ dependencies:
22
22
  requirements:
23
23
  - - '='
24
24
  - !ruby/object:Gem::Version
25
- version: 3.0.0.pre2
25
+ version: 3.0.0.pre3
26
26
  type: :runtime
27
27
  prerelease: false
28
28
  version_requirements: !ruby/object:Gem::Requirement
@@ -30,7 +30,23 @@ dependencies:
30
30
  requirements:
31
31
  - - '='
32
32
  - !ruby/object:Gem::Version
33
- version: 3.0.0.pre2
33
+ version: 3.0.0.pre3
34
+ - !ruby/object:Gem::Dependency
35
+ name: wukong-hadoop
36
+ requirement: !ruby/object:Gem::Requirement
37
+ none: false
38
+ requirements:
39
+ - - ! '>='
40
+ - !ruby/object:Gem::Version
41
+ version: 0.0.2
42
+ type: :runtime
43
+ prerelease: false
44
+ version_requirements: !ruby/object:Gem::Requirement
45
+ none: false
46
+ requirements:
47
+ - - ! '>='
48
+ - !ruby/object:Gem::Version
49
+ version: 0.0.2
34
50
  description: ! " Wonderdog provides code in both Ruby and Java to make Elasticsearch\n
35
51
  \ a more fully-fledged member of both the Hadoop and Wukong\n ecosystems.\n\n For
36
52
  the Java side, Wonderdog provides InputFormat and OutputFormat\n classes for use
@@ -45,6 +61,8 @@ files:
45
61
  - .gitignore
46
62
  - .rspec
47
63
  - CHANGELOG.md
64
+ - Gemfile
65
+ - Gemfile.lock
48
66
  - LICENSE.md
49
67
  - README.md
50
68
  - Rakefile