wonderdog 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile +8 -0
- data/Gemfile.lock +57 -0
- data/lib/wonderdog/configuration.rb +1 -0
- data/lib/wonderdog/hadoop_invocation_override.rb +31 -2
- data/lib/wonderdog/version.rb +2 -1
- data/spec/wonderdog/hadoop_invocation_override_spec.rb +104 -59
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java +1 -1
- data/wonderdog.gemspec +2 -1
- metadata +22 -4
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
wonderdog (0.0.1)
|
5
|
+
wukong (= 3.0.0.pre3)
|
6
|
+
wukong-hadoop (>= 0.0.2)
|
7
|
+
|
8
|
+
GEM
|
9
|
+
remote: http://rubygems.org/
|
10
|
+
specs:
|
11
|
+
configliere (0.4.18)
|
12
|
+
highline (>= 1.5.2)
|
13
|
+
multi_json (>= 1.1)
|
14
|
+
diff-lcs (1.1.3)
|
15
|
+
eventmachine (1.0.0)
|
16
|
+
forgery (0.5.0)
|
17
|
+
gorillib (0.4.2)
|
18
|
+
configliere (>= 0.4.13)
|
19
|
+
json
|
20
|
+
multi_json (>= 1.1)
|
21
|
+
highline (1.6.15)
|
22
|
+
json (1.7.5)
|
23
|
+
log4r (1.1.10)
|
24
|
+
multi_json (1.5.0)
|
25
|
+
rake (0.9.6)
|
26
|
+
rspec (2.12.0)
|
27
|
+
rspec-core (~> 2.12.0)
|
28
|
+
rspec-expectations (~> 2.12.0)
|
29
|
+
rspec-mocks (~> 2.12.0)
|
30
|
+
rspec-core (2.12.2)
|
31
|
+
rspec-expectations (2.12.1)
|
32
|
+
diff-lcs (~> 1.1.3)
|
33
|
+
rspec-mocks (2.12.0)
|
34
|
+
uuidtools (2.1.3)
|
35
|
+
vayacondios-client (0.1.2)
|
36
|
+
configliere (>= 0.4.16)
|
37
|
+
gorillib (~> 0.4.2)
|
38
|
+
multi_json (~> 1.1)
|
39
|
+
wukong (3.0.0.pre3)
|
40
|
+
configliere (>= 0.4.18)
|
41
|
+
eventmachine
|
42
|
+
forgery
|
43
|
+
gorillib (>= 0.4.2)
|
44
|
+
log4r
|
45
|
+
multi_json (>= 1.3.6)
|
46
|
+
uuidtools
|
47
|
+
vayacondios-client (>= 0.1.2)
|
48
|
+
wukong-hadoop (0.0.2)
|
49
|
+
wukong (= 3.0.0.pre3)
|
50
|
+
|
51
|
+
PLATFORMS
|
52
|
+
ruby
|
53
|
+
|
54
|
+
DEPENDENCIES
|
55
|
+
rake (~> 0.9)
|
56
|
+
rspec (~> 2)
|
57
|
+
wonderdog!
|
@@ -8,6 +8,7 @@ module Wukong
|
|
8
8
|
# @return [Configliere::Param] the newly configured settings
|
9
9
|
def self.configure settings
|
10
10
|
settings.define(:es_tmp_dir, :description => "Temporary directory on the HDFS to store job files while reading/writing to ElasticSearch", :default => "/user/#{ENV['USER']}/wukong", :wukong_hadoop => true)
|
11
|
+
settings.define(:es_lib_dir, :description => "Directory containing Elasticsearch, Wonderdog, and other support jars", :default => "/usr/lib/hadoop/lib", :wukong_hadoop => true)
|
11
12
|
settings.define(:es_config, :description => "Where to find configuration files detailing how to join an ElasticSearch cluster", :wukong_hadoop => true)
|
12
13
|
settings.define(:es_input_splits, :description => "Number of input splits to target when reading from ElasticSearch", :type => Integer, :wukong_hadoop => true)
|
13
14
|
settings.define(:es_request_size, :description => "Number of objects requested during each batch read from ElasticSearch", :type => Integer, :wukong_hadoop => true)
|
@@ -98,8 +98,15 @@ module Wukong
|
|
98
98
|
#
|
99
99
|
# @return [Array<String>]
|
100
100
|
def hadoop_jobconf_options
|
101
|
+
if reads_from_elasticsearch? || writes_to_elasticsearch?
|
102
|
+
settings[:map_speculative] = false if settings[:map_speculative].nil?
|
103
|
+
settings[:reduce_speculative] = false if settings[:reduce_speculative].nil?
|
104
|
+
end
|
105
|
+
|
101
106
|
super() + [].tap do |o|
|
102
|
-
|
107
|
+
if (reads_from_elasticsearch? || writes_to_elasticsearch?)
|
108
|
+
o << java_opt('es.config', settings[:es_config])
|
109
|
+
end
|
103
110
|
|
104
111
|
if reads_from_elasticsearch?
|
105
112
|
o << java_opt('elasticsearch.input.index', input_index.index)
|
@@ -121,6 +128,28 @@ module Wukong
|
|
121
128
|
end.flatten.compact
|
122
129
|
end
|
123
130
|
|
131
|
+
# :nodoc:
|
132
|
+
#
|
133
|
+
# Munge the settings object to add necessary jars if
|
134
|
+
# reading/writing to/from Elasticsearch, then call super().
|
135
|
+
def hadoop_files
|
136
|
+
if reads_from_elasticsearch? || writes_to_elasticsearch?
|
137
|
+
settings[:jars] = elasticsearch_jars if settings[:jars].empty?
|
138
|
+
end
|
139
|
+
super()
|
140
|
+
end
|
141
|
+
|
142
|
+
# All Elasticsearch, Wonderdog, and other support jars needed to
|
143
|
+
# connect Hadoop streaming with the
|
144
|
+
# ElasticSearchStreamingInputFormat and
|
145
|
+
# ElasticSearchStreamingOutputFormat provided by the Wonderdog
|
146
|
+
# Java code.
|
147
|
+
#
|
148
|
+
# @return [Array<String>]
|
149
|
+
def elasticsearch_jars
|
150
|
+
Dir[File.join(settings[:es_lib_dir] || '/usr/lib/hadoop/lib', '{elasticsearch,lucene,jna,wonderdog}*.jar')].compact.uniq
|
151
|
+
end
|
152
|
+
|
124
153
|
# Returns a temporary path on the HDFS in which to store log
|
125
154
|
# data while the Hadoop job runs.
|
126
155
|
#
|
@@ -129,7 +158,7 @@ module Wukong
|
|
129
158
|
def elasticsearch_hdfs_tmp_dir io
|
130
159
|
cleaner = %r{[^\w/\.\-\+]+}
|
131
160
|
io_part = [io.index, io.mapping].compact.map { |s| s.gsub(cleaner, '') }.join('/')
|
132
|
-
File.join(settings[:es_tmp_dir], io_part, Time.now.strftime("%Y-%m-%d-%H-%M-%S"))
|
161
|
+
File.join(settings[:es_tmp_dir] || '/', io_part || '', Time.now.strftime("%Y-%m-%d-%H-%M-%S"))
|
133
162
|
end
|
134
163
|
|
135
164
|
end
|
data/lib/wonderdog/version.rb
CHANGED
@@ -7,75 +7,120 @@ describe Wukong::Elasticsearch::HadoopInvocationOverride do
|
|
7
7
|
let(:es_writer) { driver('regexp', 'count', input: '/tmp/input_file', output: 'es:///the_index/the_map') }
|
8
8
|
let(:es_complex) { driver('regexp', 'count', input: 'es://the_index/the_map', output: 'es:///the_index/the_map', es_query: '{"hi": "there"}', es_request_size: 1000, es_index_field: 'ID') }
|
9
9
|
|
10
|
-
context "
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
10
|
+
context "passing necessary jars to Hadoop streaming" do
|
11
|
+
before { Dir.stub!(:[]).and_return(["/lib/dir/elasticsearch.jar"], ["/lib/dir/wonderdog.jar"]) }
|
12
|
+
context "when not given explicit jars" do
|
13
|
+
context "and not interacting with Elasticsearch" do
|
14
|
+
it "doesn't add jars" do
|
15
|
+
no_es.hadoop_commandline.should_not match('-libjars')
|
16
|
+
end
|
17
|
+
end
|
18
|
+
context "and reading from Elasticsearch" do
|
19
|
+
it "adds default jars it finds on the local filesystem" do
|
20
|
+
es_reader.hadoop_commandline.should match('-libjars.*elasticsearch')
|
21
|
+
end
|
22
|
+
end
|
23
|
+
context "and writing to Elasticsearch" do
|
24
|
+
it "adds default jars it finds on the local filesystem" do
|
25
|
+
es_writer.hadoop_commandline.should match('-libjars.*elasticsearch')
|
26
|
+
end
|
27
|
+
end
|
28
|
+
context "and reading and writing to Elasticsearch" do
|
29
|
+
it "adds default jars it finds on the local filesystem" do
|
30
|
+
es_complex.hadoop_commandline.should match('-libjars.*elasticsearch')
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
22
34
|
end
|
23
35
|
|
24
|
-
context "
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
its(:hadoop_commandline) { should_not match(/-D\s+elasticsearch\.output/i) }
|
36
|
+
context "setting speculative execution" do
|
37
|
+
context "when not given speculative options" do
|
38
|
+
context "and not interacting with Elasticsearch" do
|
39
|
+
it "doesn't add jars" do
|
40
|
+
no_es.hadoop_commandline.should_not match('speculative')
|
41
|
+
end
|
42
|
+
end
|
43
|
+
context "and reading from Elasticsearch" do
|
44
|
+
it "adds default jars it finds on the local filesystem" do
|
45
|
+
es_reader.hadoop_commandline.should match('-mapred.map.tasks.speculative.execution.*false')
|
46
|
+
es_reader.hadoop_commandline.should match('-mapred.reduce.tasks.speculative.execution.*false')
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
39
50
|
end
|
51
|
+
|
52
|
+
context "handling input and output paths, formats, and options when" do
|
40
53
|
|
41
|
-
|
42
|
-
|
54
|
+
context "not interacting with Elasticsearch" do
|
55
|
+
subject { no_es }
|
56
|
+
# input
|
57
|
+
its(:input_paths) { should == '/tmp/input_file' }
|
58
|
+
its(:hadoop_commandline) { should match(%r{-input.*/tmp/input_file}i) }
|
43
59
|
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
its(:hadoop_commandline) { should match(%r{-input.*/tmp/input_file}i) }
|
48
|
-
its(:hadoop_commandline) { should_not match(/-D\s+elasticsearch\.input/i) }
|
60
|
+
# output
|
61
|
+
its(:output_path) { should == '/tmp/output_file' }
|
62
|
+
its(:hadoop_commandline) { should match(%r{-output.*/tmp/output_file}i) }
|
49
63
|
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
64
|
+
# no elasticsearch anything
|
65
|
+
its(:hadoop_commandline) { should_not match(/elasticsearch/i) }
|
66
|
+
end
|
67
|
+
|
68
|
+
context "reading from Elasticsearch" do
|
69
|
+
subject { es_reader }
|
70
|
+
|
71
|
+
# input
|
72
|
+
its(:input_paths) { should match(%r{/user.*wukong.*the_index.*the_map}) }
|
73
|
+
its(:hadoop_commandline) { should match(/-inputformat.*elasticsearch/i) }
|
74
|
+
its(:hadoop_commandline) { should match(%r{-input.*/user.*wukong.*the_index.*the_map}i) }
|
75
|
+
its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.index.*the_index/i) }
|
76
|
+
its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.map.*the_map/i) }
|
77
|
+
|
78
|
+
# output
|
79
|
+
its(:output_path) { should == '/tmp/output_file' }
|
80
|
+
its(:hadoop_commandline) { should_not match(/-outputformat/i) }
|
81
|
+
its(:hadoop_commandline) { should match(%r{-output.*/tmp/output_file}i) }
|
82
|
+
its(:hadoop_commandline) { should_not match(/-D\s+elasticsearch\.output/i) }
|
83
|
+
end
|
84
|
+
|
85
|
+
context "writing to Elasticsearch" do
|
86
|
+
subject { es_writer }
|
87
|
+
|
88
|
+
# input
|
89
|
+
its(:input_paths) { should == '/tmp/input_file' }
|
90
|
+
its(:hadoop_commandline) { should_not match(/-inputformat/i) }
|
91
|
+
its(:hadoop_commandline) { should match(%r{-input.*/tmp/input_file}i) }
|
92
|
+
its(:hadoop_commandline) { should_not match(/-D\s+elasticsearch\.input/i) }
|
93
|
+
|
94
|
+
# output
|
95
|
+
its(:output_path) { should match(%r{/user.*wukong.*the_index.*the_map}) }
|
96
|
+
its(:hadoop_commandline) { should match(/-outputformat.*elasticsearch/i) }
|
97
|
+
its(:hadoop_commandline) { should match(%r{-output.*/user.*wukong.*the_index.*the_map}i) }
|
98
|
+
its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.index.*the_index/i) }
|
99
|
+
its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.map.*the_map/i) }
|
100
|
+
end
|
57
101
|
|
58
|
-
|
59
|
-
|
102
|
+
context "reading and writing with many options" do
|
103
|
+
subject { es_complex }
|
60
104
|
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
105
|
+
# input
|
106
|
+
its(:input_paths) { should match(%r{/user.*wukong.*the_index.*the_map}) }
|
107
|
+
its(:hadoop_commandline) { should match(/-inputformat.*elasticsearch/i) }
|
108
|
+
its(:hadoop_commandline) { should match(%r{-input.*/user.*wukong.*the_index.*the_map}i) }
|
109
|
+
its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.index.*the_index/i) }
|
110
|
+
its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.map.*the_map/i) }
|
67
111
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
112
|
+
# output
|
113
|
+
its(:output_path) { should match(%r{/user.*wukong.*the_index.*the_map}) }
|
114
|
+
its(:hadoop_commandline) { should match(/-outputformat.*elasticsearch/i) }
|
115
|
+
its(:hadoop_commandline) { should match(%r{-output.*/user.*wukong.*the_index.*the_map}i) }
|
116
|
+
its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.index.*the_index/i) }
|
117
|
+
its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.map.*the_map/i) }
|
74
118
|
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
119
|
+
# options
|
120
|
+
its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.query.*hi.*there/i) }
|
121
|
+
its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.request_size.*1000/i) }
|
122
|
+
its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.index\.field.*ID/i) }
|
123
|
+
end
|
79
124
|
end
|
80
125
|
|
81
126
|
end
|
@@ -48,7 +48,7 @@ public class ElasticSearchStreamingOutputFormat<K, V> implements OutputFormat<K,
|
|
48
48
|
private String idFieldName;
|
49
49
|
|
50
50
|
private static final String ES_BULK_SIZE_OPT = "elasticsearch.output.bulk_size";
|
51
|
-
private static final String ES_BULK_SIZE = "
|
51
|
+
private static final String ES_BULK_SIZE = "1000";
|
52
52
|
private int bulkSize;
|
53
53
|
|
54
54
|
|
data/wonderdog.gemspec
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wonderdog
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ authors:
|
|
13
13
|
autorequire:
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
|
-
date: 2012-12-
|
16
|
+
date: 2012-12-17 00:00:00.000000000 Z
|
17
17
|
dependencies:
|
18
18
|
- !ruby/object:Gem::Dependency
|
19
19
|
name: wukong
|
@@ -22,7 +22,7 @@ dependencies:
|
|
22
22
|
requirements:
|
23
23
|
- - '='
|
24
24
|
- !ruby/object:Gem::Version
|
25
|
-
version: 3.0.0.
|
25
|
+
version: 3.0.0.pre3
|
26
26
|
type: :runtime
|
27
27
|
prerelease: false
|
28
28
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -30,7 +30,23 @@ dependencies:
|
|
30
30
|
requirements:
|
31
31
|
- - '='
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: 3.0.0.
|
33
|
+
version: 3.0.0.pre3
|
34
|
+
- !ruby/object:Gem::Dependency
|
35
|
+
name: wukong-hadoop
|
36
|
+
requirement: !ruby/object:Gem::Requirement
|
37
|
+
none: false
|
38
|
+
requirements:
|
39
|
+
- - ! '>='
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: 0.0.2
|
42
|
+
type: :runtime
|
43
|
+
prerelease: false
|
44
|
+
version_requirements: !ruby/object:Gem::Requirement
|
45
|
+
none: false
|
46
|
+
requirements:
|
47
|
+
- - ! '>='
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: 0.0.2
|
34
50
|
description: ! " Wonderdog provides code in both Ruby and Java to make Elasticsearch\n
|
35
51
|
\ a more fully-fledged member of both the Hadoop and Wukong\n ecosystems.\n\n For
|
36
52
|
the Java side, Wonderdog provides InputFormat and OutputFormat\n classes for use
|
@@ -45,6 +61,8 @@ files:
|
|
45
61
|
- .gitignore
|
46
62
|
- .rspec
|
47
63
|
- CHANGELOG.md
|
64
|
+
- Gemfile
|
65
|
+
- Gemfile.lock
|
48
66
|
- LICENSE.md
|
49
67
|
- README.md
|
50
68
|
- Rakefile
|