wonderdog 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Gemfile ADDED
@@ -0,0 +1,8 @@
1
+ source :rubygems
2
+
3
+ gemspec
4
+
5
+ group :development do
6
+ gem 'rake', '~> 0.9'
7
+ gem 'rspec', '~> 2'
8
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,57 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ wonderdog (0.0.1)
5
+ wukong (= 3.0.0.pre3)
6
+ wukong-hadoop (>= 0.0.2)
7
+
8
+ GEM
9
+ remote: http://rubygems.org/
10
+ specs:
11
+ configliere (0.4.18)
12
+ highline (>= 1.5.2)
13
+ multi_json (>= 1.1)
14
+ diff-lcs (1.1.3)
15
+ eventmachine (1.0.0)
16
+ forgery (0.5.0)
17
+ gorillib (0.4.2)
18
+ configliere (>= 0.4.13)
19
+ json
20
+ multi_json (>= 1.1)
21
+ highline (1.6.15)
22
+ json (1.7.5)
23
+ log4r (1.1.10)
24
+ multi_json (1.5.0)
25
+ rake (0.9.6)
26
+ rspec (2.12.0)
27
+ rspec-core (~> 2.12.0)
28
+ rspec-expectations (~> 2.12.0)
29
+ rspec-mocks (~> 2.12.0)
30
+ rspec-core (2.12.2)
31
+ rspec-expectations (2.12.1)
32
+ diff-lcs (~> 1.1.3)
33
+ rspec-mocks (2.12.0)
34
+ uuidtools (2.1.3)
35
+ vayacondios-client (0.1.2)
36
+ configliere (>= 0.4.16)
37
+ gorillib (~> 0.4.2)
38
+ multi_json (~> 1.1)
39
+ wukong (3.0.0.pre3)
40
+ configliere (>= 0.4.18)
41
+ eventmachine
42
+ forgery
43
+ gorillib (>= 0.4.2)
44
+ log4r
45
+ multi_json (>= 1.3.6)
46
+ uuidtools
47
+ vayacondios-client (>= 0.1.2)
48
+ wukong-hadoop (0.0.2)
49
+ wukong (= 3.0.0.pre3)
50
+
51
+ PLATFORMS
52
+ ruby
53
+
54
+ DEPENDENCIES
55
+ rake (~> 0.9)
56
+ rspec (~> 2)
57
+ wonderdog!
@@ -8,6 +8,7 @@ module Wukong
8
8
  # @return [Configliere::Param] the newly configured settings
9
9
  def self.configure settings
10
10
  settings.define(:es_tmp_dir, :description => "Temporary directory on the HDFS to store job files while reading/writing to ElasticSearch", :default => "/user/#{ENV['USER']}/wukong", :wukong_hadoop => true)
11
+ settings.define(:es_lib_dir, :description => "Directory containing Elasticsearch, Wonderdog, and other support jars", :default => "/usr/lib/hadoop/lib", :wukong_hadoop => true)
11
12
  settings.define(:es_config, :description => "Where to find configuration files detailing how to join an ElasticSearch cluster", :wukong_hadoop => true)
12
13
  settings.define(:es_input_splits, :description => "Number of input splits to target when reading from ElasticSearch", :type => Integer, :wukong_hadoop => true)
13
14
  settings.define(:es_request_size, :description => "Number of objects requested during each batch read from ElasticSearch", :type => Integer, :wukong_hadoop => true)
@@ -98,8 +98,15 @@ module Wukong
98
98
  #
99
99
  # @return [Array<String>]
100
100
  def hadoop_jobconf_options
101
+ if reads_from_elasticsearch? || writes_to_elasticsearch?
102
+ settings[:map_speculative] = false if settings[:map_speculative].nil?
103
+ settings[:reduce_speculative] = false if settings[:reduce_speculative].nil?
104
+ end
105
+
101
106
  super() + [].tap do |o|
102
- o << java_opt('es.config', settings[:es_config]) if (reads_from_elasticsearch? || writes_to_elasticsearch?)
107
+ if (reads_from_elasticsearch? || writes_to_elasticsearch?)
108
+ o << java_opt('es.config', settings[:es_config])
109
+ end
103
110
 
104
111
  if reads_from_elasticsearch?
105
112
  o << java_opt('elasticsearch.input.index', input_index.index)
@@ -121,6 +128,28 @@ module Wukong
121
128
  end.flatten.compact
122
129
  end
123
130
 
131
+ # :nodoc:
132
+ #
133
+ # Munge the settings object to add necessary jars if
134
+ # reading/writing to/from Elasticsearch, then call super().
135
+ def hadoop_files
136
+ if reads_from_elasticsearch? || writes_to_elasticsearch?
137
+ settings[:jars] = elasticsearch_jars if settings[:jars].empty?
138
+ end
139
+ super()
140
+ end
141
+
142
+ # All Elasticsearch, Wonderdog, and other support jars needed to
143
+ # connect Hadoop streaming with the
144
+ # ElasticSearchStreamingInputFormat and
145
+ # ElasticSearchStreamingOutputFormat provided by the Wonderdog
146
+ # Java code.
147
+ #
148
+ # @return [Array<String>]
149
+ def elasticsearch_jars
150
+ Dir[File.join(settings[:es_lib_dir] || '/usr/lib/hadoop/lib', '{elasticsearch,lucene,jna,wonderdog}*.jar')].compact.uniq
151
+ end
152
+
124
153
  # Returns a temporary path on the HDFS in which to store log
125
154
  # data while the Hadoop job runs.
126
155
  #
@@ -129,7 +158,7 @@ module Wukong
129
158
  def elasticsearch_hdfs_tmp_dir io
130
159
  cleaner = %r{[^\w/\.\-\+]+}
131
160
  io_part = [io.index, io.mapping].compact.map { |s| s.gsub(cleaner, '') }.join('/')
132
- File.join(settings[:es_tmp_dir], io_part, Time.now.strftime("%Y-%m-%d-%H-%M-%S"))
161
+ File.join(settings[:es_tmp_dir] || '/', io_part || '', Time.now.strftime("%Y-%m-%d-%H-%M-%S"))
133
162
  end
134
163
 
135
164
  end
@@ -1,3 +1,4 @@
1
1
  module Wonderdog
2
- VERSION = '0.0.1'
2
+ # The currently running Wonderdog version
3
+ VERSION = '0.0.2'
3
4
  end
@@ -7,75 +7,120 @@ describe Wukong::Elasticsearch::HadoopInvocationOverride do
7
7
  let(:es_writer) { driver('regexp', 'count', input: '/tmp/input_file', output: 'es:///the_index/the_map') }
8
8
  let(:es_complex) { driver('regexp', 'count', input: 'es://the_index/the_map', output: 'es:///the_index/the_map', es_query: '{"hi": "there"}', es_request_size: 1000, es_index_field: 'ID') }
9
9
 
10
- context "not interacting with Elasticsearch" do
11
- subject { no_es }
12
- # input
13
- its(:input_paths) { should == '/tmp/input_file' }
14
- its(:hadoop_commandline) { should match(%r{-input.*/tmp/input_file}i) }
15
-
16
- # output
17
- its(:output_path) { should == '/tmp/output_file' }
18
- its(:hadoop_commandline) { should match(%r{-output.*/tmp/output_file}i) }
19
-
20
- # no elasticsearch anything
21
- its(:hadoop_commandline) { should_not match(/elasticsearch/i) }
10
+ context "passing necessary jars to Hadoop streaming" do
11
+ before { Dir.stub!(:[]).and_return(["/lib/dir/elasticsearch.jar"], ["/lib/dir/wonderdog.jar"]) }
12
+ context "when not given explicit jars" do
13
+ context "and not interacting with Elasticsearch" do
14
+ it "doesn't add jars" do
15
+ no_es.hadoop_commandline.should_not match('-libjars')
16
+ end
17
+ end
18
+ context "and reading from Elasticsearch" do
19
+ it "adds default jars it finds on the local filesystem" do
20
+ es_reader.hadoop_commandline.should match('-libjars.*elasticsearch')
21
+ end
22
+ end
23
+ context "and writing to Elasticsearch" do
24
+ it "adds default jars it finds on the local filesystem" do
25
+ es_writer.hadoop_commandline.should match('-libjars.*elasticsearch')
26
+ end
27
+ end
28
+ context "and reading and writing to Elasticsearch" do
29
+ it "adds default jars it finds on the local filesystem" do
30
+ es_complex.hadoop_commandline.should match('-libjars.*elasticsearch')
31
+ end
32
+ end
33
+ end
22
34
  end
23
35
 
24
- context "reading from Elasticsearch" do
25
- subject { es_reader }
26
-
27
- # input
28
- its(:input_paths) { should match(%r{/user.*wukong.*the_index.*the_map}) }
29
- its(:hadoop_commandline) { should match(/-inputformat.*elasticsearch/i) }
30
- its(:hadoop_commandline) { should match(%r{-input.*/user.*wukong.*the_index.*the_map}i) }
31
- its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.index.*the_index/i) }
32
- its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.map.*the_map/i) }
33
-
34
- # output
35
- its(:output_path) { should == '/tmp/output_file' }
36
- its(:hadoop_commandline) { should_not match(/-outputformat/i) }
37
- its(:hadoop_commandline) { should match(%r{-output.*/tmp/output_file}i) }
38
- its(:hadoop_commandline) { should_not match(/-D\s+elasticsearch\.output/i) }
36
+ context "setting speculative execution" do
37
+ context "when not given speculative options" do
38
+ context "and not interacting with Elasticsearch" do
39
+ it "doesn't add jars" do
40
+ no_es.hadoop_commandline.should_not match('speculative')
41
+ end
42
+ end
43
+ context "and reading from Elasticsearch" do
44
+ it "adds default jars it finds on the local filesystem" do
45
+ es_reader.hadoop_commandline.should match('-mapred.map.tasks.speculative.execution.*false')
46
+ es_reader.hadoop_commandline.should match('-mapred.reduce.tasks.speculative.execution.*false')
47
+ end
48
+ end
49
+ end
39
50
  end
51
+
52
+ context "handling input and output paths, formats, and options when" do
40
53
 
41
- context "writing to Elasticsearch" do
42
- subject { es_writer }
54
+ context "not interacting with Elasticsearch" do
55
+ subject { no_es }
56
+ # input
57
+ its(:input_paths) { should == '/tmp/input_file' }
58
+ its(:hadoop_commandline) { should match(%r{-input.*/tmp/input_file}i) }
43
59
 
44
- # input
45
- its(:input_paths) { should == '/tmp/input_file' }
46
- its(:hadoop_commandline) { should_not match(/-inputformat/i) }
47
- its(:hadoop_commandline) { should match(%r{-input.*/tmp/input_file}i) }
48
- its(:hadoop_commandline) { should_not match(/-D\s+elasticsearch\.input/i) }
60
+ # output
61
+ its(:output_path) { should == '/tmp/output_file' }
62
+ its(:hadoop_commandline) { should match(%r{-output.*/tmp/output_file}i) }
49
63
 
50
- # output
51
- its(:output_path) { should match(%r{/user.*wukong.*the_index.*the_map}) }
52
- its(:hadoop_commandline) { should match(/-outputformat.*elasticsearch/i) }
53
- its(:hadoop_commandline) { should match(%r{-output.*/user.*wukong.*the_index.*the_map}i) }
54
- its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.index.*the_index/i) }
55
- its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.map.*the_map/i) }
56
- end
64
+ # no elasticsearch anything
65
+ its(:hadoop_commandline) { should_not match(/elasticsearch/i) }
66
+ end
67
+
68
+ context "reading from Elasticsearch" do
69
+ subject { es_reader }
70
+
71
+ # input
72
+ its(:input_paths) { should match(%r{/user.*wukong.*the_index.*the_map}) }
73
+ its(:hadoop_commandline) { should match(/-inputformat.*elasticsearch/i) }
74
+ its(:hadoop_commandline) { should match(%r{-input.*/user.*wukong.*the_index.*the_map}i) }
75
+ its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.index.*the_index/i) }
76
+ its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.map.*the_map/i) }
77
+
78
+ # output
79
+ its(:output_path) { should == '/tmp/output_file' }
80
+ its(:hadoop_commandline) { should_not match(/-outputformat/i) }
81
+ its(:hadoop_commandline) { should match(%r{-output.*/tmp/output_file}i) }
82
+ its(:hadoop_commandline) { should_not match(/-D\s+elasticsearch\.output/i) }
83
+ end
84
+
85
+ context "writing to Elasticsearch" do
86
+ subject { es_writer }
87
+
88
+ # input
89
+ its(:input_paths) { should == '/tmp/input_file' }
90
+ its(:hadoop_commandline) { should_not match(/-inputformat/i) }
91
+ its(:hadoop_commandline) { should match(%r{-input.*/tmp/input_file}i) }
92
+ its(:hadoop_commandline) { should_not match(/-D\s+elasticsearch\.input/i) }
93
+
94
+ # output
95
+ its(:output_path) { should match(%r{/user.*wukong.*the_index.*the_map}) }
96
+ its(:hadoop_commandline) { should match(/-outputformat.*elasticsearch/i) }
97
+ its(:hadoop_commandline) { should match(%r{-output.*/user.*wukong.*the_index.*the_map}i) }
98
+ its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.index.*the_index/i) }
99
+ its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.map.*the_map/i) }
100
+ end
57
101
 
58
- context "reading and writing with many options" do
59
- subject { es_complex }
102
+ context "reading and writing with many options" do
103
+ subject { es_complex }
60
104
 
61
- # input
62
- its(:input_paths) { should match(%r{/user.*wukong.*the_index.*the_map}) }
63
- its(:hadoop_commandline) { should match(/-inputformat.*elasticsearch/i) }
64
- its(:hadoop_commandline) { should match(%r{-input.*/user.*wukong.*the_index.*the_map}i) }
65
- its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.index.*the_index/i) }
66
- its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.map.*the_map/i) }
105
+ # input
106
+ its(:input_paths) { should match(%r{/user.*wukong.*the_index.*the_map}) }
107
+ its(:hadoop_commandline) { should match(/-inputformat.*elasticsearch/i) }
108
+ its(:hadoop_commandline) { should match(%r{-input.*/user.*wukong.*the_index.*the_map}i) }
109
+ its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.index.*the_index/i) }
110
+ its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.map.*the_map/i) }
67
111
 
68
- # output
69
- its(:output_path) { should match(%r{/user.*wukong.*the_index.*the_map}) }
70
- its(:hadoop_commandline) { should match(/-outputformat.*elasticsearch/i) }
71
- its(:hadoop_commandline) { should match(%r{-output.*/user.*wukong.*the_index.*the_map}i) }
72
- its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.index.*the_index/i) }
73
- its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.map.*the_map/i) }
112
+ # output
113
+ its(:output_path) { should match(%r{/user.*wukong.*the_index.*the_map}) }
114
+ its(:hadoop_commandline) { should match(/-outputformat.*elasticsearch/i) }
115
+ its(:hadoop_commandline) { should match(%r{-output.*/user.*wukong.*the_index.*the_map}i) }
116
+ its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.index.*the_index/i) }
117
+ its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.map.*the_map/i) }
74
118
 
75
- # options
76
- its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.query.*hi.*there/i) }
77
- its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.request_size.*1000/i) }
78
- its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.index\.field.*ID/i) }
119
+ # options
120
+ its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.query.*hi.*there/i) }
121
+ its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.request_size.*1000/i) }
122
+ its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.index\.field.*ID/i) }
123
+ end
79
124
  end
80
125
 
81
126
  end
@@ -48,7 +48,7 @@ public class ElasticSearchStreamingOutputFormat<K, V> implements OutputFormat<K,
48
48
  private String idFieldName;
49
49
 
50
50
  private static final String ES_BULK_SIZE_OPT = "elasticsearch.output.bulk_size";
51
- private static final String ES_BULK_SIZE = "100";
51
+ private static final String ES_BULK_SIZE = "1000";
52
52
  private int bulkSize;
53
53
 
54
54
 
@@ -28,5 +28,6 @@ EOF
28
28
  gem.test_files = gem.files.grep(/^spec/)
29
29
  gem.require_paths = ['lib']
30
30
 
31
- gem.add_dependency('wukong', '3.0.0.pre2')
31
+ gem.add_dependency('wukong', '3.0.0.pre3')
32
+ gem.add_dependency('wukong-hadoop', '>= 0.0.2')
32
33
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wonderdog
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ authors:
13
13
  autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
- date: 2012-12-01 00:00:00.000000000 Z
16
+ date: 2012-12-17 00:00:00.000000000 Z
17
17
  dependencies:
18
18
  - !ruby/object:Gem::Dependency
19
19
  name: wukong
@@ -22,7 +22,7 @@ dependencies:
22
22
  requirements:
23
23
  - - '='
24
24
  - !ruby/object:Gem::Version
25
- version: 3.0.0.pre2
25
+ version: 3.0.0.pre3
26
26
  type: :runtime
27
27
  prerelease: false
28
28
  version_requirements: !ruby/object:Gem::Requirement
@@ -30,7 +30,23 @@ dependencies:
30
30
  requirements:
31
31
  - - '='
32
32
  - !ruby/object:Gem::Version
33
- version: 3.0.0.pre2
33
+ version: 3.0.0.pre3
34
+ - !ruby/object:Gem::Dependency
35
+ name: wukong-hadoop
36
+ requirement: !ruby/object:Gem::Requirement
37
+ none: false
38
+ requirements:
39
+ - - ! '>='
40
+ - !ruby/object:Gem::Version
41
+ version: 0.0.2
42
+ type: :runtime
43
+ prerelease: false
44
+ version_requirements: !ruby/object:Gem::Requirement
45
+ none: false
46
+ requirements:
47
+ - - ! '>='
48
+ - !ruby/object:Gem::Version
49
+ version: 0.0.2
34
50
  description: ! " Wonderdog provides code in both Ruby and Java to make Elasticsearch\n
35
51
  \ a more fully-fledged member of both the Hadoop and Wukong\n ecosystems.\n\n For
36
52
  the Java side, Wonderdog provides InputFormat and OutputFormat\n classes for use
@@ -45,6 +61,8 @@ files:
45
61
  - .gitignore
46
62
  - .rspec
47
63
  - CHANGELOG.md
64
+ - Gemfile
65
+ - Gemfile.lock
48
66
  - LICENSE.md
49
67
  - README.md
50
68
  - Rakefile