wonderdog 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +49 -0
- data/.rspec +2 -0
- data/CHANGELOG.md +5 -0
- data/LICENSE.md +201 -0
- data/README.md +175 -0
- data/Rakefile +10 -0
- data/bin/estool +141 -0
- data/bin/estrus.rb +136 -0
- data/bin/wonderdog +93 -0
- data/config/elasticsearch-example.yml +227 -0
- data/config/elasticsearch.in.sh +52 -0
- data/config/logging.yml +43 -0
- data/config/more_settings.yml +60 -0
- data/config/run_elasticsearch-2.sh +42 -0
- data/config/ufo_config.json +12 -0
- data/lib/wonderdog.rb +14 -0
- data/lib/wonderdog/configuration.rb +25 -0
- data/lib/wonderdog/hadoop_invocation_override.rb +139 -0
- data/lib/wonderdog/index_and_mapping.rb +67 -0
- data/lib/wonderdog/timestamp.rb +43 -0
- data/lib/wonderdog/version.rb +3 -0
- data/notes/README-benchmarking.txt +272 -0
- data/notes/README-read_tuning.textile +74 -0
- data/notes/benchmarking-201011.numbers +0 -0
- data/notes/cluster_notes.md +17 -0
- data/notes/notes.txt +91 -0
- data/notes/pigstorefunc.pig +45 -0
- data/pom.xml +80 -0
- data/spec/spec_helper.rb +22 -0
- data/spec/support/driver_helper.rb +15 -0
- data/spec/support/integration_helper.rb +30 -0
- data/spec/wonderdog/hadoop_invocation_override_spec.rb +81 -0
- data/spec/wonderdog/index_and_type_spec.rb +73 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java +268 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputCommitter.java +39 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputFormat.java +283 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchSplit.java +60 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingInputFormat.java +231 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputCommitter.java +37 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java +88 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java +176 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java +171 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingSplit.java +102 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticTest.java +108 -0
- data/src/main/java/com/infochimps/elasticsearch/hadoop/util/HadoopUtils.java +100 -0
- data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchIndex.java +216 -0
- data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchJsonIndex.java +235 -0
- data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchStorage.java +355 -0
- data/test/foo.json +3 -0
- data/test/foo.tsv +3 -0
- data/test/test_dump.pig +19 -0
- data/test/test_json_loader.pig +21 -0
- data/test/test_tsv_loader.pig +16 -0
- data/wonderdog.gemspec +32 -0
- metadata +130 -0
data/notes/pigstorefunc.pig
ADDED
@@ -0,0 +1,45 @@
--
-- Doesn't work at the moment, just some notes on how the storefunc might look.
--


--
-- Right now the ElasticSearchOutputFormat gets all its options from the
-- Job object. We can use the call to setStoreLocation in the storefunc
-- to set the required parameters. Need to make sure the following are
-- set:
--
-- wonderdog.index.name - should be set by the storefunc constructor
-- wonderdog.bulk.size - should be set by the storefunc constructor
-- wonderdog.field.names - should be set by the call to checkSchema
-- wonderdog.id.field - should be set by the storefunc constructor
-- wonderdog.object.type - should be set by the storefunc constructor
-- wonderdog.plugins.dir - should be set by call to setStoreLocation
-- wonderdog.config - should be set by call to setStoreLocation
--
-- FIXME: options used in the ElasticSearchOutputFormat should NOT be
-- namespaced with 'wonderdog'

%default INDEX 'es_index'
%default OBJ 'text_obj'


records = LOAD '$DATA' AS (text_field:chararray);
records_with_id = LOAD '$IDDATA' AS (id_field:int, text_field:chararray);

-- Here we would use the elasticsearch index name as the uri, pass in a
-- comma separated list of field names as the first arg, the id field
-- as the second arg and the bulk size as the third.
--
-- and so on.
STORE records INTO '$INDEX/$OBJ' USING ElasticSearchStorage('my_text_field', '-1', '1000');


-- but it would be really nice to duplicate what's in WonderDog.java in that,
-- should a bulk request fail, the failed records are written to hdfs. The
-- user should have some control of this. Also, it should be possible to generate
-- the field names directly from the pig schema? (We'd have to be VERY explicit in the
-- docs about this as it would be a point of headscratching/swearing...) In this
-- case we might have something like:
named_records = FOREACH records GENERATE text_field AS text_field_name;
STORE records INTO '/path/to/failed_requests' USING ElasticSearchStorage('$INDEX/$OBJ', '-1', '1000');
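
The notes above sketch how a Pig storefunc could push its settings into the Hadoop Job. As a rough illustration only (not the ElasticSearchStorage implementation shipped in this package), a minimal StoreFunc subclass could wire its constructor arguments and the setStoreLocation call to the wonderdog.* keys listed in the notes; the class name and key strings below are illustrative assumptions.

// Hypothetical sketch, not the ElasticSearchStorage shipped in this package.
// It only illustrates the idea in the notes above: a Pig StoreFunc whose
// constructor and setStoreLocation() push settings into the Job configuration
// so the OutputFormat can read them back later.
import java.io.IOException;
import org.apache.hadoop.mapreduce.Job;
import org.apache.pig.StoreFunc;

public abstract class SketchElasticSearchStorage extends StoreFunc {
    private final String fieldNames, idField, bulkSize;

    public SketchElasticSearchStorage(String fieldNames, String idField, String bulkSize) {
        this.fieldNames = fieldNames;   // comma-separated field names
        this.idField    = idField;      // "-1" means no id field
        this.bulkSize   = bulkSize;     // records per bulk request
    }

    @Override
    public void setStoreLocation(String location, Job job) throws IOException {
        // location is the '$INDEX/$OBJ' string from the STORE statement
        job.getConfiguration().set("wonderdog.index.name",  location);
        job.getConfiguration().set("wonderdog.field.names", fieldNames);
        job.getConfiguration().set("wonderdog.id.field",    idField);
        job.getConfiguration().set("wonderdog.bulk.size",   bulkSize);
    }
}
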
data/pom.xml
ADDED
@@ -0,0 +1,80 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.infochimps.elasticsearch</groupId>
  <artifactId>wonderdog</artifactId>
  <version>1.0-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>wonderdog</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <dependencies>
    <dependency>
      <groupId>org.elasticsearch</groupId>
      <artifactId>elasticsearch</artifactId>
      <version>0.19.8</version>
    </dependency>

    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-core</artifactId>
      <version>0.20.2</version>
    </dependency>

    <dependency>
      <groupId>org.apache.pig</groupId>
      <artifactId>pig</artifactId>
      <version>0.8.0</version>
    </dependency>

    <dependency>
      <groupId>org.codehaus.jackson</groupId>
      <artifactId>jackson-mapper-asl</artifactId>
      <version>1.5.2</version>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <configuration>
          <source>1.5</source>
          <target>1.5</target>
        </configuration>
      </plugin>
    </plugins>
  </build>

  <repositories>
    <repository>
      <id>com.cloudera</id>
      <url>https://repository.cloudera.com/content/repositories/releases</url>
    </repository>

    <repository>
      <id>sonatype-releases</id>
      <url>http://oss.sonatype.org/content/repositories/releases</url>
    </repository>

    <repository>
      <id>codehaus</id>
      <url>http://repository.codehaus.org/org/codehaus</url>
      <snapshots>
        <enabled>true</enabled>
      </snapshots>
      <releases>
        <enabled>true</enabled>
      </releases>
    </repository>

  </repositories>

</project>
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,22 @@
require 'wonderdog'
require 'wukong/spec_helpers'
require_relative('support/integration_helper')
require_relative('support/driver_helper')


RSpec.configure do |config|

  config.before(:each) do
    @orig_reg = Wukong.registry.show
  end

  config.after(:each) do
    Wukong.registry.clear!
    Wukong.registry.merge!(@orig_reg)
  end

  include Wukong::SpecHelpers
  include Wukong::Elasticsearch::IntegrationHelper
  include Wukong::Elasticsearch::DriverHelper
end
data/spec/support/driver_helper.rb
ADDED
@@ -0,0 +1,15 @@
module Wukong
  module Elasticsearch
    module DriverHelper

      def driver *args
        params = Elasticsearch.configure(Hadoop.configure(Configliere::Param.new))
        params.resolve!
        params.merge!(args.pop) if args.last.is_a?(Hash)
        Hadoop::Driver.new(params, *args)
      end

    end
  end
end
data/spec/support/integration_helper.rb
ADDED
@@ -0,0 +1,30 @@
module Wukong
  module Elasticsearch
    module IntegrationHelper

      def root
        @root ||= Pathname.new(File.expand_path('../../..', __FILE__))
      end

      def lib_dir
        root.join('lib')
      end

      def bin_dir
        root.join('bin')
      end

      def integration_env
        {
          "RUBYLIB" => [lib_dir.to_s, ENV["RUBYLIB"]].compact.join(':')
        }
      end

      def integration_cwd
        root.to_s
      end

    end
  end
end
data/spec/wonderdog/hadoop_invocation_override_spec.rb
ADDED
@@ -0,0 +1,81 @@
require 'spec_helper'

describe Wukong::Elasticsearch::HadoopInvocationOverride do

  let(:no_es)      { driver('regexp', 'count', input: '/tmp/input_file', output: '/tmp/output_file') }
  let(:es_reader)  { driver('regexp', 'count', input: 'es://the_index/the_map', output: '/tmp/output_file') }
  let(:es_writer)  { driver('regexp', 'count', input: '/tmp/input_file', output: 'es:///the_index/the_map') }
  let(:es_complex) { driver('regexp', 'count', input: 'es://the_index/the_map', output: 'es:///the_index/the_map', es_query: '{"hi": "there"}', es_request_size: 1000, es_index_field: 'ID') }

  context "not interacting with Elasticsearch" do
    subject { no_es }

    # input
    its(:input_paths)        { should == '/tmp/input_file' }
    its(:hadoop_commandline) { should match(%r{-input.*/tmp/input_file}i) }

    # output
    its(:output_path)        { should == '/tmp/output_file' }
    its(:hadoop_commandline) { should match(%r{-output.*/tmp/output_file}i) }

    # no elasticsearch anything
    its(:hadoop_commandline) { should_not match(/elasticsearch/i) }
  end

  context "reading from Elasticsearch" do
    subject { es_reader }

    # input
    its(:input_paths)        { should match(%r{/user.*wukong.*the_index.*the_map}) }
    its(:hadoop_commandline) { should match(/-inputformat.*elasticsearch/i) }
    its(:hadoop_commandline) { should match(%r{-input.*/user.*wukong.*the_index.*the_map}i) }
    its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.index.*the_index/i) }
    its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.map.*the_map/i) }

    # output
    its(:output_path)        { should == '/tmp/output_file' }
    its(:hadoop_commandline) { should_not match(/-outputformat/i) }
    its(:hadoop_commandline) { should match(%r{-output.*/tmp/output_file}i) }
    its(:hadoop_commandline) { should_not match(/-D\s+elasticsearch\.output/i) }
  end

  context "writing to Elasticsearch" do
    subject { es_writer }

    # input
    its(:input_paths)        { should == '/tmp/input_file' }
    its(:hadoop_commandline) { should_not match(/-inputformat/i) }
    its(:hadoop_commandline) { should match(%r{-input.*/tmp/input_file}i) }
    its(:hadoop_commandline) { should_not match(/-D\s+elasticsearch\.input/i) }

    # output
    its(:output_path)        { should match(%r{/user.*wukong.*the_index.*the_map}) }
    its(:hadoop_commandline) { should match(/-outputformat.*elasticsearch/i) }
    its(:hadoop_commandline) { should match(%r{-output.*/user.*wukong.*the_index.*the_map}i) }
    its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.index.*the_index/i) }
    its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.map.*the_map/i) }
  end

  context "reading and writing with many options" do
    subject { es_complex }

    # input
    its(:input_paths)        { should match(%r{/user.*wukong.*the_index.*the_map}) }
    its(:hadoop_commandline) { should match(/-inputformat.*elasticsearch/i) }
    its(:hadoop_commandline) { should match(%r{-input.*/user.*wukong.*the_index.*the_map}i) }
    its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.index.*the_index/i) }
    its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.map.*the_map/i) }

    # output
    its(:output_path)        { should match(%r{/user.*wukong.*the_index.*the_map}) }
    its(:hadoop_commandline) { should match(/-outputformat.*elasticsearch/i) }
    its(:hadoop_commandline) { should match(%r{-output.*/user.*wukong.*the_index.*the_map}i) }
    its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.index.*the_index/i) }
    its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.map.*the_map/i) }

    # options
    its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.query.*hi.*there/i) }
    its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.input\.request_size.*1000/i) }
    its(:hadoop_commandline) { should match(/-D\s+elasticsearch\.output\.index\.field.*ID/i) }
  end

end
data/spec/wonderdog/index_and_type_spec.rb
ADDED
@@ -0,0 +1,73 @@
require 'spec_helper'

describe Wukong::Elasticsearch::IndexAndMapping do

  subject { Wukong::Elasticsearch::IndexAndMapping }

  let(:filesystem_path)  { '/some/path' }
  let(:filesystem_paths) { '/some/path,/some/other/path' }

  let(:hdfs_path)  { 'hdfs://some/hdfs/path' }
  let(:hdfs_paths) { 'hdfs://some/hdfs/path,hdfs://some/other/hdfs/path' }

  let(:es_index_and_mapping)    { 'es://index/mapping' }
  let(:es_indices_and_mapping)  { 'es://index1,index2/mapping' }
  let(:es_index_and_mappings)   { 'es://index/mapping1,mapping2' }
  let(:es_indices_and_mappings) { 'es://index1,index2/mapping1,mapping2' }

  fails  = %w[filesystem_path filesystem_paths hdfs_path hdfs_paths]
  passes = %w[es_index_and_mapping es_indices_and_mapping es_index_and_mappings es_indices_and_mappings]

  context 'recognizing possible es://index/mapping specifications' do
    fails.each do |name|
      it "doesn't recognize a #{name}" do
        subject.matches?(self.send(name)).should be_false
      end
    end
    passes.each do |name|
      it "recognizes a #{name}" do
        subject.matches?(self.send(name)).should be_true
      end
    end
  end

  context "parsing es://index/mapping specifications" do
    fails.each do |name|
      it "raises an error on a #{name}" do
        lambda { subject.new(self.send(name)) }.should raise_error(Wukong::Error, /not an elasticsearch.*index\/mapping/i)
      end
    end

    it "raises an error on a specification with too many parts" do
      lambda { subject.new('es://index/mapping/extra') }.should raise_error(Wukong::Error, /not an elasticsearch.*index\/mapping/i)
    end

    it "raises an error on a specification with too few parts" do
      lambda { subject.new('es://') }.should raise_error(Wukong::Error, /not an elasticsearch.*index\/mapping/i)
    end

    context "on an index and mapping" do
      subject { Wukong::Elasticsearch::IndexAndMapping.new(es_index_and_mapping) }
      its(:index)   { should == 'index' }
      its(:mapping) { should == 'mapping' }
    end
    context "on indices and a mapping" do
      subject { Wukong::Elasticsearch::IndexAndMapping.new(es_indices_and_mapping) }
      its(:index)   { should == 'index1,index2' }
      its(:mapping) { should == 'mapping' }
    end
    context "on an index and mappings" do
      subject { Wukong::Elasticsearch::IndexAndMapping.new(es_index_and_mappings) }
      its(:index)   { should == 'index' }
      its(:mapping) { should == 'mapping1,mapping2' }
    end
    context "on indices and mappings" do
      subject { Wukong::Elasticsearch::IndexAndMapping.new(es_indices_and_mappings) }
      its(:index)   { should == 'index1,index2' }
      its(:mapping) { should == 'mapping1,mapping2' }
    end

  end

end
data/src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java
ADDED
@@ -0,0 +1,268 @@
package com.infochimps.elasticsearch;

import java.io.IOException;
import java.util.Map;
import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.io.*;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import com.infochimps.elasticsearch.hadoop.util.HadoopUtils;

import org.elasticsearch.node.Node;
import org.elasticsearch.node.NodeBuilder;
import org.elasticsearch.client.Client;
import org.elasticsearch.client.Requests;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.search.SearchType;
import org.elasticsearch.index.query.FilterBuilders.*;
import org.elasticsearch.index.query.QueryBuilders;

/**
   A Hadoop InputFormat to read data from an Elasticsearch index. The RecordReader
   divulges records where the key is the record id in elasticsearch and the value
   is a json string of the (source) record contents.
 */
public class ElasticSearchInputFormat extends InputFormat<Text, Text> implements Configurable {

    static Log LOG = LogFactory.getLog(ElasticSearchInputFormat.class);
    private Configuration conf = null;

    private Node node;
    private Client client;

    private Integer requestSize;
    private Long numHits;
    private Long numSplits;
    private Long numSplitRecords;
    private String indexName;
    private String objType;
    private String queryString;

    private static final String ES_REQUEST_SIZE = "elasticsearch.request.size"; // number of records to fetch at one time
    private static final String ES_NUM_SPLITS = "elasticsearch.num.input.splits"; // number of hadoop map tasks to launch
    private static final String ES_QUERY_STRING = "elasticsearch.query.string";

    private static final String ES_CONFIG_NAME = "elasticsearch.yml";
    private static final String ES_PLUGINS_NAME = "plugins";
    private static final String ES_INDEX_NAME = "elasticsearch.index.name";
    private static final String ES_OBJECT_TYPE = "elasticsearch.object.type";
    private static final String ES_CONFIG = "es.config";
    private static final String ES_PLUGINS = "es.path.plugins";
    private static final String SLASH = "/";

    public RecordReader<Text,Text> createRecordReader(InputSplit inputSplit,
                                                      TaskAttemptContext context) {
        return new ElasticSearchRecordReader();
    }

    /**
       The number of splits is specified in the Hadoop configuration object.
     */
    public List<InputSplit> getSplits(JobContext context) {
        setConf(context.getConfiguration());
        List<InputSplit> splits = new ArrayList<InputSplit>(numSplits.intValue());
        for(int i = 0; i < numSplits; i++) {
            Long size = (numSplitRecords == 1) ? 1 : numSplitRecords-1;
            splits.add(new ElasticSearchSplit(queryString, i*numSplitRecords, size));
        }
        if (numHits % numSplits > 0) splits.add(new ElasticSearchSplit(queryString, numSplits*numSplitRecords, numHits % numSplits - 1));
        LOG.info("Created ["+splits.size()+"] splits for ["+numHits+"] hits");
        return splits;
    }

    /**
       Sets the configuration object, opens a connection to elasticsearch, and
       initiates the initial search request.
     */
    @Override
    public void setConf(Configuration configuration) {
        this.conf = configuration;
        this.indexName = conf.get(ES_INDEX_NAME);
        this.objType = conf.get(ES_OBJECT_TYPE);
        this.requestSize = Integer.parseInt(conf.get(ES_REQUEST_SIZE));
        this.numSplits = Long.parseLong(conf.get(ES_NUM_SPLITS));
        this.queryString = conf.get(ES_QUERY_STRING);

        //
        // Need to ensure that this is set in the hadoop configuration so we can
        // instantiate a local client. The reason is that no files are in the
        // distributed cache when this is called.
        //
        System.setProperty(ES_CONFIG, conf.get(ES_CONFIG));
        System.setProperty(ES_PLUGINS, conf.get(ES_PLUGINS));

        start_embedded_client();

        initiate_search();
    }

    @Override
    public Configuration getConf() {
        return conf;
    }

    /**
       Starts an embedded elasticsearch client (ie. data = false)
     */
    private void start_embedded_client() {
        LOG.info("Starting embedded elasticsearch client ...");
        this.node   = NodeBuilder.nodeBuilder().client(true).node();
        this.client = node.client();
    }

    private void initiate_search() {
        SearchResponse response = client.prepareSearch(indexName)
            .setTypes(objType)
            .setSearchType(SearchType.COUNT)
            .setQuery(QueryBuilders.queryString(queryString))
            .setSize(requestSize)
            .execute()
            .actionGet();
        this.numHits = response.hits().totalHits();
        if(numSplits > numHits) numSplits = numHits; // This could be bad
        this.numSplitRecords = (numHits/numSplits);
    }

    protected class ElasticSearchRecordReader extends RecordReader<Text, Text> {

        private Node node;
        private Client client;

        private String indexName;
        private String objType;
        private Long numSplitRecords;
        private Integer requestSize;
        private Text currentKey;
        private Text currentValue;
        private Integer recordsRead;
        private Iterator<SearchHit> hitsItr = null;


        private String queryString;
        private Long from;
        private Long recsToRead;

        public ElasticSearchRecordReader() {
        }

        public void initialize(InputSplit split, TaskAttemptContext context) throws IOException {
            Configuration conf = context.getConfiguration();
            this.indexName = conf.get(ES_INDEX_NAME);
            this.objType = conf.get(ES_OBJECT_TYPE);
            LOG.info("Initializing elasticsearch record reader on index ["+indexName+"] and object type ["+objType+"]");

            //
            // Fetches elasticsearch.yml and the plugins directory from the distributed cache
            //
            try {
                String taskConfigPath = HadoopUtils.fetchFileFromCache(ES_CONFIG_NAME, conf);
                LOG.info("Using ["+taskConfigPath+"] as es.config");
                String taskPluginsPath = HadoopUtils.fetchArchiveFromCache(ES_PLUGINS_NAME, conf);
                LOG.info("Using ["+taskPluginsPath+"] as es.plugins.dir");
                System.setProperty(ES_CONFIG, taskConfigPath);
                System.setProperty(ES_PLUGINS, taskPluginsPath+SLASH+ES_PLUGINS_NAME);
            } catch (Exception e) {
                throw new RuntimeException(e);
            }

            queryString = ((ElasticSearchSplit)split).getQueryString();
            from = ((ElasticSearchSplit)split).getFrom();
            recsToRead = ((ElasticSearchSplit)split).getSize();

            LOG.info("elasticsearch record reader: query ["+queryString+"], from ["+from+"], size ["+recsToRead+"]");
            start_embedded_client();
            recordsRead = 0;
        }

        /**
           Starts an embedded elasticsearch client (ie. data = false)
         */
        private void start_embedded_client() {
            LOG.info("Starting embedded elasticsearch client ...");
            this.node   = NodeBuilder.nodeBuilder().client(true).node();
            this.client = node.client();
        }

        private Iterator<SearchHit> fetchNextHits() {
            SearchResponse response = client.prepareSearch(indexName)
                .setTypes(objType)
                .setFrom(from.intValue())
                .setSize(recsToRead.intValue())
                .setQuery(QueryBuilders.queryString(queryString))
                .execute()
                .actionGet();
            return response.hits().iterator();
        }

        @Override
        public boolean nextKeyValue() throws IOException {
            if (hitsItr!=null) {
                if (recordsRead < recsToRead) {
                    if (hitsItr.hasNext()) {
                        SearchHit hit = hitsItr.next();
                        currentKey = new Text(hit.id());
                        currentValue = new Text(hit.sourceAsString());
                        recordsRead += 1;
                        return true;
                    }
                } else {
                    hitsItr = null;
                }
            } else {
                if (recordsRead < recsToRead) {
                    hitsItr = fetchNextHits();
                    if (hitsItr.hasNext()) {
                        SearchHit hit = hitsItr.next();
                        currentKey = new Text(hit.id());
                        currentValue = new Text(hit.sourceAsString());
                        recordsRead += 1;
                        return true;
                    }
                }
            }
            return false;
        }

        @Override
        public Text getCurrentKey() {
            return currentKey;
        }

        @Override
        public Text getCurrentValue() {
            return currentValue;
        }

        @Override
        public float getProgress() throws IOException {
            return 0;
        }

        @Override
        public void close() throws IOException {
            LOG.info("Closing record reader");
            client.close();
            LOG.info("Client is closed");
            if (node != null) {
                node.close();
            }
            LOG.info("Record reader closed.");
        }

    }
}
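
For orientation, ElasticSearchInputFormat.setConf() above reads everything it needs from the Hadoop Configuration (the elasticsearch.* and es.* keys defined at the top of the class). A minimal, hypothetical driver sketch wiring those keys is shown below; the class name SketchReadJob, the job name, and all values and paths are illustrative assumptions, not part of this package.

// Hypothetical driver sketch (not part of this package). It only shows the
// Configuration keys that ElasticSearchInputFormat.setConf() reads; values
// and paths are illustrative.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import com.infochimps.elasticsearch.ElasticSearchInputFormat;

public class SketchReadJob {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("elasticsearch.index.name",       "my_index");   // ES_INDEX_NAME
        conf.set("elasticsearch.object.type",      "my_type");    // ES_OBJECT_TYPE
        conf.set("elasticsearch.request.size",     "100");        // ES_REQUEST_SIZE: hits per search request
        conf.set("elasticsearch.num.input.splits", "4");          // ES_NUM_SPLITS: map tasks to launch
        conf.set("elasticsearch.query.string",     "*:*");        // ES_QUERY_STRING
        conf.set("es.config",       "/path/to/elasticsearch.yml"); // ES_CONFIG, used by the embedded client
        conf.set("es.path.plugins", "/path/to/plugins");            // ES_PLUGINS
        Job job = new Job(conf, "wonderdog-read");
        job.setInputFormatClass(ElasticSearchInputFormat.class);
        // ... mapper, output format, and output path would be configured here ...
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
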