wonderdog 0.0.1
- data/.gitignore +49 -0
- data/.rspec +2 -0
- data/CHANGELOG.md +5 -0
- data/LICENSE.md +201 -0
- data/README.md +175 -0
- data/Rakefile +10 -0
- data/bin/estool +141 -0
- data/bin/estrus.rb +136 -0
- data/bin/wonderdog +93 -0
- data/config/elasticsearch-example.yml +227 -0
- data/config/elasticsearch.in.sh +52 -0
- data/config/logging.yml +43 -0
- data/config/more_settings.yml +60 -0
- data/config/run_elasticsearch-2.sh +42 -0
- data/config/ufo_config.json +12 -0
- data/lib/wonderdog.rb +14 -0
- data/lib/wonderdog/configuration.rb +25 -0
- data/lib/wonderdog/hadoop_invocation_override.rb +139 -0
- data/lib/wonderdog/index_and_mapping.rb +67 -0
- data/lib/wonderdog/timestamp.rb +43 -0
- data/lib/wonderdog/version.rb +3 -0
- data/notes/README-benchmarking.txt +272 -0
- data/notes/README-read_tuning.textile +74 -0
- data/notes/benchmarking-201011.numbers +0 -0
- data/notes/cluster_notes.md +17 -0
- data/notes/notes.txt +91 -0
- data/notes/pigstorefunc.pig +45 -0
- data/pom.xml +80 -0
- data/spec/spec_helper.rb +22 -0
- data/spec/support/driver_helper.rb +15 -0
- data/spec/support/integration_helper.rb +30 -0
- data/spec/wonderdog/hadoop_invocation_override_spec.rb +81 -0
- data/spec/wonderdog/index_and_type_spec.rb +73 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java +268 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputCommitter.java +39 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputFormat.java +283 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchSplit.java +60 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingInputFormat.java +231 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputCommitter.java +37 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java +88 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java +176 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java +171 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingSplit.java +102 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticTest.java +108 -0
- data/src/main/java/com/infochimps/elasticsearch/hadoop/util/HadoopUtils.java +100 -0
- data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchIndex.java +216 -0
- data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchJsonIndex.java +235 -0
- data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchStorage.java +355 -0
- data/test/foo.json +3 -0
- data/test/foo.tsv +3 -0
- data/test/test_dump.pig +19 -0
- data/test/test_json_loader.pig +21 -0
- data/test/test_tsv_loader.pig +16 -0
- data/wonderdog.gemspec +32 -0
- metadata +130 -0
data/notes/pigstorefunc.pig
ADDED
@@ -0,0 +1,45 @@
+--
+-- Doesn't work at the moment, just some notes on how the storefunc might look.
+--
+
+
+--
+-- Right now the ElasticSearchOutputFormat gets all its options from the
+-- Job object. We can use the call to setStoreLocation in the storefunc
+-- to set the required parameters. Need to make sure the following are
+-- set:
+--
+-- wonderdog.index.name - should be set by the storefunc constructor
+-- wonderdog.bulk.size - should be set by the storefunc constructor
+-- wonderdog.field.names - should be set by the call to checkSchema
+-- wonderdog.id.field - should be set by the storefunc constructor
+-- wonderdog.object.type - should be set by the storefunc constructor
+-- wonderdog.plugins.dir - should be set by call to setStoreLocation
+-- wonderdog.config - should be set by call to setStoreLocation
+--
+-- FIXME: options used in the ElasticSearchOutputFormat should NOT be
+-- namespaced with 'wonderdog'
+
+%default INDEX 'es_index'
+%default OBJ 'text_obj'
+
+
+records = LOAD '$DATA' AS (text_field:chararray);
+records_with_id = LOAD '$IDDATA' AS (id_field:int, text_field:chararray);
+
+-- Here we would use the elasticsearch index name as the uri, pass in a
+-- comma separated list of field names as the first arg, the id field
+-- as the second arg and the bulk size as the third.
+--
+-- and so on.
+STORE records INTO '$INDEX/$OBJ' USING ElasticSearchStorage('my_text_field', '-1', '1000');
+
+
+-- but it would be really nice to duplicate what's in WonderDog.java in that,
+-- should a bulk request fail, the failed records are written to hdfs. The
+-- user should have some control of this. Also, it should be possible to generate
+-- the field names directly from the pig schema? (We'd have to be VERY explicit in the
+-- docs about this as it would be a point of headscratching/swearing...) In this
+-- case we might have something like:
+named_records = FOREACH records GENERATE text_field AS text_field_name;
+STORE records INTO '/path/to/failed_requests' USING ElasticSearchStorage('$INDEX/$OBJ', '-1', '1000');
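The storefunc sketched in these notes would hand its settings to ElasticSearchOutputFormat through the Hadoop Job configuration. A minimal, hypothetical illustration (not part of this gem; the helper class and argument names are invented for clarity, only the wonderdog.* property keys come from the notes above) of what that hand-off could look like:

// Hypothetical helper (illustration only): mirrors what a storefunc's
// setStoreLocation/checkSchema would do -- copy its constructor arguments and
// schema-derived field names into the Job configuration so that
// ElasticSearchOutputFormat can read them on the task side.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class StoreFuncConfigSketch {
    public static void configure(Job job, String index, String objType,
                                 String idField, String bulkSize, String fieldNames) {
        Configuration conf = job.getConfiguration();
        conf.set("wonderdog.index.name",  index);      // set by the storefunc constructor
        conf.set("wonderdog.object.type", objType);    // set by the storefunc constructor
        conf.set("wonderdog.id.field",    idField);    // set by the storefunc constructor
        conf.set("wonderdog.bulk.size",   bulkSize);   // set by the storefunc constructor
        conf.set("wonderdog.field.names", fieldNames); // set by the call to checkSchema
    }
}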
data/pom.xml
ADDED
@@ -0,0 +1,80 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <groupId>com.infochimps.elasticsearch</groupId>
+  <artifactId>wonderdog</artifactId>
+  <version>1.0-SNAPSHOT</version>
+  <packaging>jar</packaging>
+
+  <name>wonderdog</name>
+  <url>http://maven.apache.org</url>
+
+  <properties>
+    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+  </properties>
+
+  <dependencies>
+    <dependency>
+      <groupId>org.elasticsearch</groupId>
+      <artifactId>elasticsearch</artifactId>
+      <version>0.19.8</version>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.hadoop</groupId>
+      <artifactId>hadoop-core</artifactId>
+      <version>0.20.2</version>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.pig</groupId>
+      <artifactId>pig</artifactId>
+      <version>0.8.0</version>
+    </dependency>
+
+    <dependency>
+      <groupId>org.codehaus.jackson</groupId>
+      <artifactId>jackson-mapper-asl</artifactId>
+      <version>1.5.2</version>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-compiler-plugin</artifactId>
+        <configuration>
+          <source>1.5</source>
+          <target>1.5</target>
+        </configuration>
+      </plugin>
+    </plugins>
+  </build>
+
+  <repositories>
+    <repository>
+      <id>com.cloudera</id>
+      <url>https://repository.cloudera.com/content/repositories/releases</url>
+    </repository>
+
+    <repository>
+      <id>sonatype-releases</id>
+      <url>http://oss.sonatype.org/content/repositories/releases</url>
+    </repository>
+
+    <repository>
+      <id>codehaus</id>
+      <url>http://repository.codehaus.org/org/codehaus</url>
+      <snapshots>
+        <enabled>true</enabled>
+      </snapshots>
+      <releases>
+        <enabled>true</enabled>
+      </releases>
+    </repository>
+
+  </repositories>
+
+</project>
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,22 @@
+require 'wonderdog'
+require 'wukong/spec_helpers'
+require_relative('support/integration_helper')
+require_relative('support/driver_helper')
+
+
+RSpec.configure do |config|
+
+  config.before(:each) do
+    @orig_reg = Wukong.registry.show
+  end
+
+  config.after(:each) do
+    Wukong.registry.clear!
+    Wukong.registry.merge!(@orig_reg)
+  end
+
+  include Wukong::SpecHelpers
+  include Wukong::Elasticsearch::IntegrationHelper
+  include Wukong::Elasticsearch::DriverHelper
+end
+
data/spec/support/driver_helper.rb
ADDED
@@ -0,0 +1,15 @@
+module Wukong
+  module Elasticsearch
+    module DriverHelper
+
+      def driver *args
+        params = Elasticsearch.configure(Hadoop.configure(Configliere::Param.new))
+        params.resolve!
+        params.merge!(args.pop) if args.last.is_a?(Hash)
+        Hadoop::Driver.new(params, *args)
+      end
+
+    end
+  end
+end
+
data/spec/support/integration_helper.rb
ADDED
@@ -0,0 +1,30 @@
+module Wukong
+  module Elasticsearch
+    module IntegrationHelper
+
+      def root
+        @root ||= Pathname.new(File.expand_path('../../..', __FILE__))
+      end
+
+      def lib_dir
+        root.join('lib')
+      end
+
+      def bin_dir
+        root.join('bin')
+      end
+
+      def integration_env
+        {
+          "RUBYLIB" => [lib_dir.to_s, ENV["RUBYLIB"]].compact.join(':')
+        }
+      end
+
+      def integration_cwd
+        root.to_s
+      end
+
+    end
+  end
+end
+
data/spec/wonderdog/hadoop_invocation_override_spec.rb
ADDED
@@ -0,0 +1,81 @@
+require 'spec_helper'
+
+describe Wukong::Elasticsearch::HadoopInvocationOverride do
+
+  let(:no_es)      { driver('regexp', 'count', input: '/tmp/input_file', output: '/tmp/output_file') }
+  let(:es_reader)  { driver('regexp', 'count', input: 'es://the_index/the_map', output: '/tmp/output_file') }
+  let(:es_writer)  { driver('regexp', 'count', input: '/tmp/input_file', output: 'es:///the_index/the_map') }
+  let(:es_complex) { driver('regexp', 'count', input: 'es://the_index/the_map', output: 'es:///the_index/the_map', es_query: '{"hi": "there"}', es_request_size: 1000, es_index_field: 'ID') }
+
+  context "not interacting with Elasticsearch" do
+    subject { no_es }
+    # input
+    its(:input_paths)        { should == '/tmp/input_file' }
+    its(:hadoop_commandline) { should match(%r{-input.*/tmp/input_file}i) }
+
+    # output
+    its(:output_path)        { should == '/tmp/output_file' }
+    its(:hadoop_commandline) { should match(%r{-output.*/tmp/output_file}i) }
+
+    # no elasticsearch anything
+    its(:hadoop_commandline) { should_not match(/elasticsearch/i) }
+  end
+
+  context "reading from Elasticsearch" do
+    subject { es_reader }
+
+    # input
+    its(:input_paths)        { should match(%r{/user.*wukong.*the_index.*the_map}) }
+    its(:hadoop_commandline) { should match(/-inputformat.*elasticsearch/i) }
+    its(:hadoop_commandline) { should match(%r{-input.*/user.*wukong.*the_index.*the_map}i) }
+    its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.index.*the_index/i) }
+    its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.map.*the_map/i) }
+
+    # output
+    its(:output_path)        { should == '/tmp/output_file' }
+    its(:hadoop_commandline) { should_not match(/-outputformat/i) }
+    its(:hadoop_commandline) { should match(%r{-output.*/tmp/output_file}i) }
+    its(:hadoop_commandline) { should_not match(/-D\s+elasticsearch\.output/i) }
+  end
+
+  context "writing to Elasticsearch" do
+    subject { es_writer }
+
+    # input
+    its(:input_paths)        { should == '/tmp/input_file' }
+    its(:hadoop_commandline) { should_not match(/-inputformat/i) }
+    its(:hadoop_commandline) { should match(%r{-input.*/tmp/input_file}i) }
+    its(:hadoop_commandline) { should_not match(/-D\s+elasticsearch\.input/i) }
+
+    # output
+    its(:output_path)        { should match(%r{/user.*wukong.*the_index.*the_map}) }
+    its(:hadoop_commandline) { should match(/-outputformat.*elasticsearch/i) }
+    its(:hadoop_commandline) { should match(%r{-output.*/user.*wukong.*the_index.*the_map}i) }
+    its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.index.*the_index/i) }
+    its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.map.*the_map/i) }
+  end
+
+  context "reading and writing with many options" do
+    subject { es_complex }
+
+    # input
+    its(:input_paths)        { should match(%r{/user.*wukong.*the_index.*the_map}) }
+    its(:hadoop_commandline) { should match(/-inputformat.*elasticsearch/i) }
+    its(:hadoop_commandline) { should match(%r{-input.*/user.*wukong.*the_index.*the_map}i) }
+    its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.index.*the_index/i) }
+    its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.map.*the_map/i) }
+
+    # output
+    its(:output_path)        { should match(%r{/user.*wukong.*the_index.*the_map}) }
+    its(:hadoop_commandline) { should match(/-outputformat.*elasticsearch/i) }
+    its(:hadoop_commandline) { should match(%r{-output.*/user.*wukong.*the_index.*the_map}i) }
+    its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.index.*the_index/i) }
+    its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.map.*the_map/i) }
+
+    # options
+    its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.query.*hi.*there/i) }
+    its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.request_size.*1000/i) }
+    its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.index\.field.*ID/i) }
+  end
+
+end
data/spec/wonderdog/index_and_type_spec.rb
ADDED
@@ -0,0 +1,73 @@
+require 'spec_helper'
+
+describe Wukong::Elasticsearch::IndexAndMapping do
+
+  subject { Wukong::Elasticsearch::IndexAndMapping }
+
+  let(:filesystem_path)  { '/some/path' }
+  let(:filesystem_paths) { '/some/path,/some/other/path' }
+
+  let(:hdfs_path)  { 'hdfs://some/hdfs/path' }
+  let(:hdfs_paths) { 'hdfs://some/hdfs/path,hdfs://some/other/hdfs/path' }
+
+  let(:es_index_and_mapping)    { 'es://index/mapping' }
+  let(:es_indices_and_mapping)  { 'es://index1,index2/mapping' }
+  let(:es_index_and_mappings)   { 'es://index/mapping1,mapping2' }
+  let(:es_indices_and_mappings) { 'es://index1,index2/mapping1,mapping2' }
+
+  fails  = %w[filesystem_path filesystem_paths hdfs_path hdfs_paths]
+  passes = %w[es_index_and_mapping es_indices_and_mapping es_index_and_mappings es_indices_and_mappings]
+
+  context 'recognizing possible es://index/mapping specifications' do
+    fails.each do |name|
+      it "doesn't recognize a #{name}" do
+        subject.matches?(self.send(name)).should be_false
+      end
+    end
+    passes.each do |name|
+      it "recognizes a #{name}" do
+        subject.matches?(self.send(name)).should be_true
+      end
+    end
+  end
+
+  context "parsing es://index/mapping specifications" do
+    fails.each do |name|
+      it "raises an error on a #{name}" do
+        lambda { subject.new(self.send(name)) }.should raise_error(Wukong::Error, /not an elasticsearch.*index\/mapping/i)
+      end
+    end
+
+    it "raises an error on a specification with too many parts" do
+      lambda { subject.new('es://index/mapping/extra') }.should raise_error(Wukong::Error, /not an elasticsearch.*index\/mapping/i)
+    end
+
+    it "raises an error on a specification with too few parts" do
+      lambda { subject.new('es://') }.should raise_error(Wukong::Error, /not an elasticsearch.*index\/mapping/i)
+    end
+
+    context "on an index and mapping" do
+      subject { Wukong::Elasticsearch::IndexAndMapping.new(es_index_and_mapping) }
+      its(:index)   { should == 'index' }
+      its(:mapping) { should == 'mapping' }
+    end
+    context "on indices and a mapping" do
+      subject { Wukong::Elasticsearch::IndexAndMapping.new(es_indices_and_mapping) }
+      its(:index)   { should == 'index1,index2' }
+      its(:mapping) { should == 'mapping' }
+    end
+    context "on an index and mappings" do
+      subject { Wukong::Elasticsearch::IndexAndMapping.new(es_index_and_mappings) }
+      its(:index)   { should == 'index' }
+      its(:mapping) { should == 'mapping1,mapping2' }
+    end
+    context "on indices and mappings" do
+      subject { Wukong::Elasticsearch::IndexAndMapping.new(es_indices_and_mappings) }
+      its(:index)   { should == 'index1,index2' }
+      its(:mapping) { should == 'mapping1,mapping2' }
+    end
+
+
+  end
+
+end
data/src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java
ADDED
@@ -0,0 +1,268 @@
+package com.infochimps.elasticsearch;
+
+import java.io.IOException;
+import java.util.Map;
+import java.util.List;
+import java.util.ArrayList;
+import java.util.Iterator;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
+import org.apache.hadoop.io.*;
+import org.apache.hadoop.conf.Configurable;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapreduce.InputFormat;
+import org.apache.hadoop.mapreduce.RecordReader;
+import org.apache.hadoop.mapreduce.InputSplit;
+import org.apache.hadoop.mapreduce.JobContext;
+import org.apache.hadoop.mapreduce.TaskAttemptContext;
+
+import com.infochimps.elasticsearch.hadoop.util.HadoopUtils;
+
+import org.elasticsearch.node.Node;
+import org.elasticsearch.node.NodeBuilder;
+import org.elasticsearch.client.Client;
+import org.elasticsearch.client.Requests;
+import org.elasticsearch.search.SearchHit;
+import org.elasticsearch.action.search.SearchResponse;
+import org.elasticsearch.action.search.SearchType;
+import org.elasticsearch.index.query.FilterBuilders.*;
+import org.elasticsearch.index.query.QueryBuilders;
+
+/**
+
+   A Hadoop InputFormat to read data from an Elasticsearch index. The RecordReader
+   divulges records where the key is the record id in elasticsearch and the value
+   is a json string of the (source) record contents.
+
+ */
+public class ElasticSearchInputFormat extends InputFormat<Text, Text> implements Configurable {
+
+    static Log LOG = LogFactory.getLog(ElasticSearchInputFormat.class);
+    private Configuration conf = null;
+
+    private Node node;
+    private Client client;
+
+    private Integer requestSize;
+    private Long numHits;
+    private Long numSplits;
+    private Long numSplitRecords;
+    private String indexName;
+    private String objType;
+    private String queryString;
+
+    private static final String ES_REQUEST_SIZE = "elasticsearch.request.size"; // number of records to fetch at one time
+    private static final String ES_NUM_SPLITS = "elasticsearch.num.input.splits"; // number of hadoop map tasks to launch
+    private static final String ES_QUERY_STRING = "elasticsearch.query.string";
+
+    private static final String ES_CONFIG_NAME = "elasticsearch.yml";
+    private static final String ES_PLUGINS_NAME = "plugins";
+    private static final String ES_INDEX_NAME = "elasticsearch.index.name";
+    private static final String ES_OBJECT_TYPE = "elasticsearch.object.type";
+    private static final String ES_CONFIG = "es.config";
+    private static final String ES_PLUGINS = "es.path.plugins";
+    private static final String SLASH = "/";
+
+    public RecordReader<Text,Text> createRecordReader(InputSplit inputSplit,
+                                                      TaskAttemptContext context) {
+        return new ElasticSearchRecordReader();
+    }
+
+    /**
+       The number of splits is specified in the Hadoop configuration object.
+     */
+    public List<InputSplit> getSplits(JobContext context) {
+        setConf(context.getConfiguration());
+        List<InputSplit> splits = new ArrayList<InputSplit>(numSplits.intValue());
+        for(int i = 0; i < numSplits; i++) {
+            Long size = (numSplitRecords == 1) ? 1 : numSplitRecords-1;
+            splits.add(new ElasticSearchSplit(queryString, i*numSplitRecords, size));
+        }
+        if (numHits % numSplits > 0) splits.add(new ElasticSearchSplit(queryString, numSplits*numSplitRecords, numHits % numSplits - 1));
+        LOG.info("Created ["+splits.size()+"] splits for ["+numHits+"] hits");
+        return splits;
+    }
+
+    /**
+       Sets the configuration object, opens a connection to elasticsearch, and
+       initiates the initial search request.
+     */
+    @Override
+    public void setConf(Configuration configuration) {
+        this.conf = configuration;
+        this.indexName = conf.get(ES_INDEX_NAME);
+        this.objType = conf.get(ES_OBJECT_TYPE);
+        this.requestSize = Integer.parseInt(conf.get(ES_REQUEST_SIZE));
+        this.numSplits = Long.parseLong(conf.get(ES_NUM_SPLITS));
+        this.queryString = conf.get(ES_QUERY_STRING);
+
+        //
+        // Need to ensure that this is set in the hadoop configuration so we can
+        // instantiate a local client. The reason is that no files are in the
+        // distributed cache when this is called.
+        //
+        System.setProperty(ES_CONFIG, conf.get(ES_CONFIG));
+        System.setProperty(ES_PLUGINS, conf.get(ES_PLUGINS));
+
+        start_embedded_client();
+
+        initiate_search();
+    }
+
+    @Override
+    public Configuration getConf() {
+        return conf;
+    }
+
+    /**
+       Starts an embedded elasticsearch client (ie. data = false)
+     */
+    private void start_embedded_client() {
+        LOG.info("Starting embedded elasticsearch client ...");
+        this.node = NodeBuilder.nodeBuilder().client(true).node();
+        this.client = node.client();
+    }
+
+    private void initiate_search() {
+        SearchResponse response = client.prepareSearch(indexName)
+            .setTypes(objType)
+            .setSearchType(SearchType.COUNT)
+            .setQuery(QueryBuilders.queryString(queryString))
+            .setSize(requestSize)
+            .execute()
+            .actionGet();
+        this.numHits = response.hits().totalHits();
+        if(numSplits > numHits) numSplits = numHits; // This could be bad
+        this.numSplitRecords = (numHits/numSplits);
+    }
+
+    protected class ElasticSearchRecordReader extends RecordReader<Text, Text> {
+
+        private Node node;
+        private Client client;
+
+        private String indexName;
+        private String objType;
+        private Long numSplitRecords;
+        private Integer requestSize;
+        private Text currentKey;
+        private Text currentValue;
+        private Integer recordsRead;
+        private Iterator<SearchHit> hitsItr = null;
+
+
+        private String queryString;
+        private Long from;
+        private Long recsToRead;
+
+        public ElasticSearchRecordReader() {
+        }
+
+        public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {
+            Configuration conf = context.getConfiguration();
+            this.indexName = conf.get(ES_INDEX_NAME);
+            this.objType = conf.get(ES_OBJECT_TYPE);
+            LOG.info("Initializing elasticsearch record reader on index ["+indexName+"] and object type ["+objType+"]");
+
+            //
+            // Fetches elasticsearch.yml and the plugins directory from the distributed cache
+            //
+            try {
+                String taskConfigPath = HadoopUtils.fetchFileFromCache(ES_CONFIG_NAME, conf);
+                LOG.info("Using ["+taskConfigPath+"] as es.config");
+                String taskPluginsPath = HadoopUtils.fetchArchiveFromCache(ES_PLUGINS_NAME, conf);
+                LOG.info("Using ["+taskPluginsPath+"] as es.plugins.dir");
+                System.setProperty(ES_CONFIG, taskConfigPath);
+                System.setProperty(ES_PLUGINS, taskPluginsPath+SLASH+ES_PLUGINS_NAME);
+            } catch (Exception e) {
+                throw new RuntimeException(e);
+            }
+
+            queryString = ((ElasticSearchSplit)split).getQueryString();
+            from = ((ElasticSearchSplit)split).getFrom();
+            recsToRead = ((ElasticSearchSplit)split).getSize();
+
+            LOG.info("elasticsearch record reader: query ["+queryString+"], from ["+from+"], size ["+recsToRead+"]");
+            start_embedded_client();
+            recordsRead = 0;
+        }
+
+        /**
+           Starts an embedded elasticsearch client (ie. data = false)
+         */
+        private void start_embedded_client() {
+            LOG.info("Starting embedded elasticsearch client ...");
+            this.node = NodeBuilder.nodeBuilder().client(true).node();
+            this.client = node.client();
+        }
+
+        private Iterator<SearchHit> fetchNextHits() {
+            SearchResponse response = client.prepareSearch(indexName)
+                .setTypes(objType)
+                .setFrom(from.intValue())
+                .setSize(recsToRead.intValue())
+                .setQuery(QueryBuilders.queryString(queryString))
+                .execute()
+                .actionGet();
+            return response.hits().iterator();
+        }
+
+        @Override
+        public boolean nextKeyValue() throws IOException {
+            if (hitsItr!=null) {
+                if (recordsRead < recsToRead) {
+                    if (hitsItr.hasNext()) {
+                        SearchHit hit = hitsItr.next();
+                        currentKey = new Text(hit.id());
+                        currentValue = new Text(hit.sourceAsString());
+                        recordsRead += 1;
+                        return true;
+                    }
+                } else {
+                    hitsItr = null;
+                }
+            } else {
+                if (recordsRead < recsToRead) {
+                    hitsItr = fetchNextHits();
+                    if (hitsItr.hasNext()) {
+                        SearchHit hit = hitsItr.next();
+                        currentKey = new Text(hit.id());
+                        currentValue = new Text(hit.sourceAsString());
+                        recordsRead += 1;
+                        return true;
+                    }
+                }
+            }
+            return false;
+        }
+
+        @Override
+        public Text getCurrentKey() {
+            return currentKey;
+        }
+
+        @Override
+        public Text getCurrentValue() {
+            return currentValue;
+        }
+
+        @Override
+        public float getProgress() throws IOException {
+            return 0;
+        }
+
+        @Override
+        public void close() throws IOException {
+            LOG.info("Closing record reader");
+            client.close();
+            LOG.info("Client is closed");
+            if (node != null) {
+                node.close();
+            }
+            LOG.info("Record reader closed.");
+        }
+
+    }
+}
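ElasticSearchInputFormat pulls everything it needs from the job configuration in setConf(), so a driver only has to set the properties named above before submitting. A rough, hypothetical usage sketch (the class name, paths, and values here are invented; only the property keys come from the source above):

// Hypothetical driver-side sketch: populating the configuration keys that
// ElasticSearchInputFormat reads in setConf(). Paths and values are examples.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import com.infochimps.elasticsearch.ElasticSearchInputFormat;

public class ElasticSearchInputJobSketch {
    public static Job buildJob() throws Exception {
        Configuration conf = new Configuration();
        conf.set("elasticsearch.index.name",       "my_index");
        conf.set("elasticsearch.object.type",      "my_type");
        conf.set("elasticsearch.request.size",     "100");  // records fetched per search request
        conf.set("elasticsearch.num.input.splits", "4");    // map tasks to launch
        conf.set("elasticsearch.query.string",     "*:*");  // query_string query to run
        conf.set("es.config",       "/path/to/elasticsearch.yml"); // needed before the distributed cache is populated
        conf.set("es.path.plugins", "/path/to/plugins");

        Job job = new Job(conf, "elasticsearch-input-sketch");
        job.setInputFormatClass(ElasticSearchInputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        return job;
    }
}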