wonderdog 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +8 -0
- data/Gemfile.lock +57 -0
- data/lib/wonderdog/configuration.rb +1 -0
- data/lib/wonderdog/hadoop_invocation_override.rb +31 -2
- data/lib/wonderdog/version.rb +2 -1
- data/spec/wonderdog/hadoop_invocation_override_spec.rb +104 -59
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java +1 -1
- data/wonderdog.gemspec +2 -1
- metadata +22 -4
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,57 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
wonderdog (0.0.1)
|
5
|
+
wukong (= 3.0.0.pre3)
|
6
|
+
wukong-hadoop (>= 0.0.2)
|
7
|
+
|
8
|
+
GEM
|
9
|
+
remote: http://rubygems.org/
|
10
|
+
specs:
|
11
|
+
configliere (0.4.18)
|
12
|
+
highline (>= 1.5.2)
|
13
|
+
multi_json (>= 1.1)
|
14
|
+
diff-lcs (1.1.3)
|
15
|
+
eventmachine (1.0.0)
|
16
|
+
forgery (0.5.0)
|
17
|
+
gorillib (0.4.2)
|
18
|
+
configliere (>= 0.4.13)
|
19
|
+
json
|
20
|
+
multi_json (>= 1.1)
|
21
|
+
highline (1.6.15)
|
22
|
+
json (1.7.5)
|
23
|
+
log4r (1.1.10)
|
24
|
+
multi_json (1.5.0)
|
25
|
+
rake (0.9.6)
|
26
|
+
rspec (2.12.0)
|
27
|
+
rspec-core (~> 2.12.0)
|
28
|
+
rspec-expectations (~> 2.12.0)
|
29
|
+
rspec-mocks (~> 2.12.0)
|
30
|
+
rspec-core (2.12.2)
|
31
|
+
rspec-expectations (2.12.1)
|
32
|
+
diff-lcs (~> 1.1.3)
|
33
|
+
rspec-mocks (2.12.0)
|
34
|
+
uuidtools (2.1.3)
|
35
|
+
vayacondios-client (0.1.2)
|
36
|
+
configliere (>= 0.4.16)
|
37
|
+
gorillib (~> 0.4.2)
|
38
|
+
multi_json (~> 1.1)
|
39
|
+
wukong (3.0.0.pre3)
|
40
|
+
configliere (>= 0.4.18)
|
41
|
+
eventmachine
|
42
|
+
forgery
|
43
|
+
gorillib (>= 0.4.2)
|
44
|
+
log4r
|
45
|
+
multi_json (>= 1.3.6)
|
46
|
+
uuidtools
|
47
|
+
vayacondios-client (>= 0.1.2)
|
48
|
+
wukong-hadoop (0.0.2)
|
49
|
+
wukong (= 3.0.0.pre3)
|
50
|
+
|
51
|
+
PLATFORMS
|
52
|
+
ruby
|
53
|
+
|
54
|
+
DEPENDENCIES
|
55
|
+
rake (~> 0.9)
|
56
|
+
rspec (~> 2)
|
57
|
+
wonderdog!
|
@@ -8,6 +8,7 @@ module Wukong
|
|
8
8
|
# @return [Configliere::Param] the newly configured settings
|
9
9
|
def self.configure settings
|
10
10
|
settings.define(:es_tmp_dir, :description => "Temporary directory on the HDFS to store job files while reading/writing to ElasticSearch", :default => "/user/#{ENV['USER']}/wukong", :wukong_hadoop => true)
|
11
|
+
settings.define(:es_lib_dir, :description => "Directory containing Elasticsearch, Wonderdog, and other support jars", :default => "/usr/lib/hadoop/lib", :wukong_hadoop => true)
|
11
12
|
settings.define(:es_config, :description => "Where to find configuration files detailing how to join an ElasticSearch cluster", :wukong_hadoop => true)
|
12
13
|
settings.define(:es_input_splits, :description => "Number of input splits to target when reading from ElasticSearch", :type => Integer, :wukong_hadoop => true)
|
13
14
|
settings.define(:es_request_size, :description => "Number of objects requested during each batch read from ElasticSearch", :type => Integer, :wukong_hadoop => true)
|
@@ -98,8 +98,15 @@ module Wukong
|
|
98
98
|
#
|
99
99
|
# @return [Array<String>]
|
100
100
|
def hadoop_jobconf_options
|
101
|
+
if reads_from_elasticsearch? || writes_to_elasticsearch?
|
102
|
+
settings[:map_speculative] = false if settings[:map_speculative].nil?
|
103
|
+
settings[:reduce_speculative] = false if settings[:reduce_speculative].nil?
|
104
|
+
end
|
105
|
+
|
101
106
|
super() + [].tap do |o|
|
102
|
-
|
107
|
+
if (reads_from_elasticsearch? || writes_to_elasticsearch?)
|
108
|
+
o << java_opt('es.config', settings[:es_config])
|
109
|
+
end
|
103
110
|
|
104
111
|
if reads_from_elasticsearch?
|
105
112
|
o << java_opt('elasticsearch.input.index', input_index.index)
|
@@ -121,6 +128,28 @@ module Wukong
|
|
121
128
|
end.flatten.compact
|
122
129
|
end
|
123
130
|
|
131
|
+
# :nodoc:
|
132
|
+
#
|
133
|
+
# Munge the settings object to add necessary jars if
|
134
|
+
# reading/writing to/from Elasticsearch, then call super().
|
135
|
+
def hadoop_files
|
136
|
+
if reads_from_elasticsearch? || writes_to_elasticsearch?
|
137
|
+
settings[:jars] = elasticsearch_jars if settings[:jars].empty?
|
138
|
+
end
|
139
|
+
super()
|
140
|
+
end
|
141
|
+
|
142
|
+
# All Elasticsearch, Wonderdog, and other support jars needed to
|
143
|
+
# connect Hadoop streaming with the
|
144
|
+
# ElasticSearchStreamingInputFormat and
|
145
|
+
# ElasticSearchStreamingOutputFormat provided by the Wonderdog
|
146
|
+
# Java code.
|
147
|
+
#
|
148
|
+
# @return [Array<String>]
|
149
|
+
def elasticsearch_jars
|
150
|
+
Dir[File.join(settings[:es_lib_dir] || '/usr/lib/hadoop/lib', '{elasticsearch,lucene,jna,wonderdog}*.jar')].compact.uniq
|
151
|
+
end
|
152
|
+
|
124
153
|
# Returns a temporary path on the HDFS in which to store log
|
125
154
|
# data while the Hadoop job runs.
|
126
155
|
#
|
@@ -129,7 +158,7 @@ module Wukong
|
|
129
158
|
def elasticsearch_hdfs_tmp_dir io
|
130
159
|
cleaner = %r{[^\w/\.\-\+]+}
|
131
160
|
io_part = [io.index, io.mapping].compact.map { |s| s.gsub(cleaner, '') }.join('/')
|
132
|
-
File.join(settings[:es_tmp_dir], io_part, Time.now.strftime("%Y-%m-%d-%H-%M-%S"))
|
161
|
+
File.join(settings[:es_tmp_dir] || '/', io_part || '', Time.now.strftime("%Y-%m-%d-%H-%M-%S"))
|
133
162
|
end
|
134
163
|
|
135
164
|
end
|
data/lib/wonderdog/version.rb
CHANGED
@@ -7,75 +7,120 @@ describe Wukong::Elasticsearch::HadoopInvocationOverride do
|
|
7
7
|
let(:es_writer) { driver('regexp', 'count', input: '/tmp/input_file', output: 'es:///the_index/the_map') }
|
8
8
|
let(:es_complex) { driver('regexp', 'count', input: 'es://the_index/the_map', output: 'es:///the_index/the_map', es_query: '{"hi": "there"}', es_request_size: 1000, es_index_field: 'ID') }
|
9
9
|
|
10
|
-
context "
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
10
|
+
context "passing necessary jars to Hadoop streaming" do
|
11
|
+
before { Dir.stub!(:[]).and_return(["/lib/dir/elasticsearch.jar"], ["/lib/dir/wonderdog.jar"]) }
|
12
|
+
context "when not given explicit jars" do
|
13
|
+
context "and not interacting with Elasticsearch" do
|
14
|
+
it "doesn't add jars" do
|
15
|
+
no_es.hadoop_commandline.should_not match('-libjars')
|
16
|
+
end
|
17
|
+
end
|
18
|
+
context "and reading from Elasticsearch" do
|
19
|
+
it "adds default jars it finds on the local filesystem" do
|
20
|
+
es_reader.hadoop_commandline.should match('-libjars.*elasticsearch')
|
21
|
+
end
|
22
|
+
end
|
23
|
+
context "and writing to Elasticsearch" do
|
24
|
+
it "adds default jars it finds on the local filesystem" do
|
25
|
+
es_writer.hadoop_commandline.should match('-libjars.*elasticsearch')
|
26
|
+
end
|
27
|
+
end
|
28
|
+
context "and reading and writing to Elasticsearch" do
|
29
|
+
it "adds default jars it finds on the local filesystem" do
|
30
|
+
es_complex.hadoop_commandline.should match('-libjars.*elasticsearch')
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
22
34
|
end
|
23
35
|
|
24
|
-
context "
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
its(:hadoop_commandline) { should_not match(/-D\s+elasticsearch\.output/i) }
|
36
|
+
context "setting speculative execution" do
|
37
|
+
context "when not given speculative options" do
|
38
|
+
context "and not interacting with Elasticsearch" do
|
39
|
+
it "doesn't add jars" do
|
40
|
+
no_es.hadoop_commandline.should_not match('speculative')
|
41
|
+
end
|
42
|
+
end
|
43
|
+
context "and reading from Elasticsearch" do
|
44
|
+
it "adds default jars it finds on the local filesystem" do
|
45
|
+
es_reader.hadoop_commandline.should match('-mapred.map.tasks.speculative.execution.*false')
|
46
|
+
es_reader.hadoop_commandline.should match('-mapred.reduce.tasks.speculative.execution.*false')
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
39
50
|
end
|
51
|
+
|
52
|
+
context "handling input and output paths, formats, and options when" do
|
40
53
|
|
41
|
-
|
42
|
-
|
54
|
+
context "not interacting with Elasticsearch" do
|
55
|
+
subject { no_es }
|
56
|
+
# input
|
57
|
+
its(:input_paths) { should == '/tmp/input_file' }
|
58
|
+
its(:hadoop_commandline) { should match(%r{-input.*/tmp/input_file}i) }
|
43
59
|
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
its(:hadoop_commandline) { should match(%r{-input.*/tmp/input_file}i) }
|
48
|
-
its(:hadoop_commandline) { should_not match(/-D\s+elasticsearch\.input/i) }
|
60
|
+
# output
|
61
|
+
its(:output_path) { should == '/tmp/output_file' }
|
62
|
+
its(:hadoop_commandline) { should match(%r{-output.*/tmp/output_file}i) }
|
49
63
|
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
64
|
+
# no elasticsearch anything
|
65
|
+
its(:hadoop_commandline) { should_not match(/elasticsearch/i) }
|
66
|
+
end
|
67
|
+
|
68
|
+
context "reading from Elasticsearch" do
|
69
|
+
subject { es_reader }
|
70
|
+
|
71
|
+
# input
|
72
|
+
its(:input_paths) { should match(%r{/user.*wukong.*the_index.*the_map}) }
|
73
|
+
its(:hadoop_commandline) { should match(/-inputformat.*elasticsearch/i) }
|
74
|
+
its(:hadoop_commandline) { should match(%r{-input.*/user.*wukong.*the_index.*the_map}i) }
|
75
|
+
its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.index.*the_index/i) }
|
76
|
+
its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.map.*the_map/i) }
|
77
|
+
|
78
|
+
# output
|
79
|
+
its(:output_path) { should == '/tmp/output_file' }
|
80
|
+
its(:hadoop_commandline) { should_not match(/-outputformat/i) }
|
81
|
+
its(:hadoop_commandline) { should match(%r{-output.*/tmp/output_file}i) }
|
82
|
+
its(:hadoop_commandline) { should_not match(/-D\s+elasticsearch\.output/i) }
|
83
|
+
end
|
84
|
+
|
85
|
+
context "writing to Elasticsearch" do
|
86
|
+
subject { es_writer }
|
87
|
+
|
88
|
+
# input
|
89
|
+
its(:input_paths) { should == '/tmp/input_file' }
|
90
|
+
its(:hadoop_commandline) { should_not match(/-inputformat/i) }
|
91
|
+
its(:hadoop_commandline) { should match(%r{-input.*/tmp/input_file}i) }
|
92
|
+
its(:hadoop_commandline) { should_not match(/-D\s+elasticsearch\.input/i) }
|
93
|
+
|
94
|
+
# output
|
95
|
+
its(:output_path) { should match(%r{/user.*wukong.*the_index.*the_map}) }
|
96
|
+
its(:hadoop_commandline) { should match(/-outputformat.*elasticsearch/i) }
|
97
|
+
its(:hadoop_commandline) { should match(%r{-output.*/user.*wukong.*the_index.*the_map}i) }
|
98
|
+
its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.index.*the_index/i) }
|
99
|
+
its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.map.*the_map/i) }
|
100
|
+
end
|
57
101
|
|
58
|
-
|
59
|
-
|
102
|
+
context "reading and writing with many options" do
|
103
|
+
subject { es_complex }
|
60
104
|
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
105
|
+
# input
|
106
|
+
its(:input_paths) { should match(%r{/user.*wukong.*the_index.*the_map}) }
|
107
|
+
its(:hadoop_commandline) { should match(/-inputformat.*elasticsearch/i) }
|
108
|
+
its(:hadoop_commandline) { should match(%r{-input.*/user.*wukong.*the_index.*the_map}i) }
|
109
|
+
its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.index.*the_index/i) }
|
110
|
+
its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.map.*the_map/i) }
|
67
111
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
112
|
+
# output
|
113
|
+
its(:output_path) { should match(%r{/user.*wukong.*the_index.*the_map}) }
|
114
|
+
its(:hadoop_commandline) { should match(/-outputformat.*elasticsearch/i) }
|
115
|
+
its(:hadoop_commandline) { should match(%r{-output.*/user.*wukong.*the_index.*the_map}i) }
|
116
|
+
its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.index.*the_index/i) }
|
117
|
+
its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.map.*the_map/i) }
|
74
118
|
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
119
|
+
# options
|
120
|
+
its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.query.*hi.*there/i) }
|
121
|
+
its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.request_size.*1000/i) }
|
122
|
+
its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.index\.field.*ID/i) }
|
123
|
+
end
|
79
124
|
end
|
80
125
|
|
81
126
|
end
|
@@ -48,7 +48,7 @@ public class ElasticSearchStreamingOutputFormat<K, V> implements OutputFormat<K,
|
|
48
48
|
private String idFieldName;
|
49
49
|
|
50
50
|
private static final String ES_BULK_SIZE_OPT = "elasticsearch.output.bulk_size";
|
51
|
-
private static final String ES_BULK_SIZE = "
|
51
|
+
private static final String ES_BULK_SIZE = "1000";
|
52
52
|
private int bulkSize;
|
53
53
|
|
54
54
|
|
data/wonderdog.gemspec
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wonderdog
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.1
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ authors:
|
|
13
13
|
autorequire:
|
14
14
|
bindir: bin
|
15
15
|
cert_chain: []
|
16
|
-
date: 2012-12-
|
16
|
+
date: 2012-12-17 00:00:00.000000000 Z
|
17
17
|
dependencies:
|
18
18
|
- !ruby/object:Gem::Dependency
|
19
19
|
name: wukong
|
@@ -22,7 +22,7 @@ dependencies:
|
|
22
22
|
requirements:
|
23
23
|
- - '='
|
24
24
|
- !ruby/object:Gem::Version
|
25
|
-
version: 3.0.0.
|
25
|
+
version: 3.0.0.pre3
|
26
26
|
type: :runtime
|
27
27
|
prerelease: false
|
28
28
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -30,7 +30,23 @@ dependencies:
|
|
30
30
|
requirements:
|
31
31
|
- - '='
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: 3.0.0.
|
33
|
+
version: 3.0.0.pre3
|
34
|
+
- !ruby/object:Gem::Dependency
|
35
|
+
name: wukong-hadoop
|
36
|
+
requirement: !ruby/object:Gem::Requirement
|
37
|
+
none: false
|
38
|
+
requirements:
|
39
|
+
- - ! '>='
|
40
|
+
- !ruby/object:Gem::Version
|
41
|
+
version: 0.0.2
|
42
|
+
type: :runtime
|
43
|
+
prerelease: false
|
44
|
+
version_requirements: !ruby/object:Gem::Requirement
|
45
|
+
none: false
|
46
|
+
requirements:
|
47
|
+
- - ! '>='
|
48
|
+
- !ruby/object:Gem::Version
|
49
|
+
version: 0.0.2
|
34
50
|
description: ! " Wonderdog provides code in both Ruby and Java to make Elasticsearch\n
|
35
51
|
\ a more fully-fledged member of both the Hadoop and Wukong\n ecosystems.\n\n For
|
36
52
|
the Java side, Wonderdog provides InputFormat and OutputFormat\n classes for use
|
@@ -45,6 +61,8 @@ files:
|
|
45
61
|
- .gitignore
|
46
62
|
- .rspec
|
47
63
|
- CHANGELOG.md
|
64
|
+
- Gemfile
|
65
|
+
- Gemfile.lock
|
48
66
|
- LICENSE.md
|
49
67
|
- README.md
|
50
68
|
- Rakefile
|