wonderdog 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +49 -0
- data/.rspec +2 -0
- data/CHANGELOG.md +5 -0
- data/LICENSE.md +201 -0
- data/README.md +175 -0
- data/Rakefile +10 -0
- data/bin/estool +141 -0
- data/bin/estrus.rb +136 -0
- data/bin/wonderdog +93 -0
- data/config/elasticsearch-example.yml +227 -0
- data/config/elasticsearch.in.sh +52 -0
- data/config/logging.yml +43 -0
- data/config/more_settings.yml +60 -0
- data/config/run_elasticsearch-2.sh +42 -0
- data/config/ufo_config.json +12 -0
- data/lib/wonderdog.rb +14 -0
- data/lib/wonderdog/configuration.rb +25 -0
- data/lib/wonderdog/hadoop_invocation_override.rb +139 -0
- data/lib/wonderdog/index_and_mapping.rb +67 -0
- data/lib/wonderdog/timestamp.rb +43 -0
- data/lib/wonderdog/version.rb +3 -0
- data/notes/README-benchmarking.txt +272 -0
- data/notes/README-read_tuning.textile +74 -0
- data/notes/benchmarking-201011.numbers +0 -0
- data/notes/cluster_notes.md +17 -0
- data/notes/notes.txt +91 -0
- data/notes/pigstorefunc.pig +45 -0
- data/pom.xml +80 -0
- data/spec/spec_helper.rb +22 -0
- data/spec/support/driver_helper.rb +15 -0
- data/spec/support/integration_helper.rb +30 -0
- data/spec/wonderdog/hadoop_invocation_override_spec.rb +81 -0
- data/spec/wonderdog/index_and_type_spec.rb +73 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java +268 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputCommitter.java +39 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputFormat.java +283 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchSplit.java +60 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingInputFormat.java +231 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputCommitter.java +37 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java +88 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java +176 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java +171 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingSplit.java +102 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticTest.java +108 -0
- data/src/main/java/com/infochimps/elasticsearch/hadoop/util/HadoopUtils.java +100 -0
- data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchIndex.java +216 -0
- data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchJsonIndex.java +235 -0
- data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchStorage.java +355 -0
- data/test/foo.json +3 -0
- data/test/foo.tsv +3 -0
- data/test/test_dump.pig +19 -0
- data/test/test_json_loader.pig +21 -0
- data/test/test_tsv_loader.pig +16 -0
- data/wonderdog.gemspec +32 -0
- metadata +130 -0
@@ -0,0 +1,52 @@
|
|
1
|
+
# Environment setup for an Elasticsearch node: directories, classpath,
# resource limits, heap size and JVM flags.  Everything is communicated
# through the variables exported at the bottom of this file.
#
# NOTE(review): presumably sourced by the elasticsearch launcher via
# ES_INCLUDE -- confirm against the launcher script.
#
# Overridable from the calling environment:
#   ES_CONF_DIR, ES_WORK_DIR, ES_DATA_DIR  -- directory locations
#   ES_MIN_MEM, ES_MAX_MEM                 -- JVM heap bounds
#   ES_JMX_HOSTNAME                        -- hostname advertised for JMX/RMI

# An unset variable takes the default; a variable explicitly set to the
# empty string is left alone (plain '-' expansion, not ':-').
export ES_CONF_DIR=${ES_CONF_DIR-/etc/elasticsearch}
export ES_WORK_DIR=${ES_WORK_DIR-/mnt/elasticsearch/work}
export ES_DATA_DIR=${ES_DATA_DIR-/mnt/elasticsearch/data}

# The '*' entries are not expanded by the shell (no globbing happens in an
# assignment); they are passed through literally for the JVM to expand.
export CLASSPATH="$ES_HOME/plugins/cloud-aws.zip"
CLASSPATH="$CLASSPATH:$ES_HOME/lib/elasticsearch-0.11.0.jar:$ES_HOME/lib/*:$ES_HOME/lib/sigar/*"

# bump the # of open files way way up
ulimit -n 65536
# allow elasticsearch to lock itself into memory if JNA is installed
ulimit -l unlimited

# Heap defaults, applied when the variable is unset OR empty.
: "${ES_MIN_MEM:=256m}"
: "${ES_MAX_MEM:=1500m}"

# Hostname advertised to remote JMX clients.  This used to be hard-coded to
# a single EC2 host; that value is kept as the default so existing
# deployments behave the same, but every other machine should set
# ES_JMX_HOSTNAME to its own externally resolvable name.
ES_JMX_HOSTNAME=${ES_JMX_HOSTNAME-ec2-184-73-69-18.compute-1.amazonaws.com}

# Arguments to pass to the JVM
JAVA_OPTS="$JAVA_OPTS -Xms${ES_MIN_MEM}"
JAVA_OPTS="$JAVA_OPTS -Xmx${ES_MAX_MEM}"
JAVA_OPTS="$JAVA_OPTS -Xss128k"

JAVA_OPTS="$JAVA_OPTS -Djline.enabled=true"

JAVA_OPTS="$JAVA_OPTS -XX:+AggressiveOpts"

# Garbage collector selection and tuning
JAVA_OPTS="$JAVA_OPTS -XX:+UseParNewGC"
JAVA_OPTS="$JAVA_OPTS -XX:+UseConcMarkSweepGC"
JAVA_OPTS="$JAVA_OPTS -XX:+CMSParallelRemarkEnabled"
JAVA_OPTS="$JAVA_OPTS -XX:SurvivorRatio=8"
JAVA_OPTS="$JAVA_OPTS -XX:MaxTenuringThreshold=1"
# Heap dumps on OOM land under the work directory
JAVA_OPTS="$JAVA_OPTS -XX:+HeapDumpOnOutOfMemoryError"
JAVA_OPTS="$JAVA_OPTS -XX:HeapDumpPath=$ES_WORK_DIR/heap"
# Verbose GC logging
JAVA_OPTS="$JAVA_OPTS -XX:+PrintGCTimeStamps -XX:+PrintTenuringDistribution -XX:+TraceClassUnloading -XX:+PrintGCDetails -verbose:gc -Xloggc:/var/log/elasticsearch/elasticsearch-gc.log"

JAVA_OPTS="$JAVA_OPTS -XX:+UseCompressedOops" # avoid this on sun java < 1.6.0_20

# ensures JMX accessible from outside world
# NOTE(review): SSL and authentication are both disabled here, so anyone who
# can reach the JMX port can manage the JVM -- confirm the port is firewalled.
JAVA_OPTS="$JAVA_OPTS -Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false -Djava.rmi.server.hostname=${ES_JMX_HOSTNAME} "

# More options to consider LATER
# java.net.preferIPv4Stack=true: Better OOTB experience, especially with jgroups
# -XX:CMSInitiatingOccupancyFraction=88

ES_JAVA_OPTS="$ES_JAVA_OPTS -Des.path.data=$ES_DATA_DIR -Des.path.work=$ES_WORK_DIR"

# Show the final option strings for debugging
echo JAVA_OPTS="'$JAVA_OPTS'"
echo ES_JAVA_OPTS="'$ES_JAVA_OPTS'"

export JAVA_OPTS ES_JAVA_OPTS ES_MAX_MEM ES_MIN_MEM
|
data/config/logging.yml
ADDED
@@ -0,0 +1,43 @@
|
|
1
|
+
rootLogger: DEBUG, console, file
|
2
|
+
|
3
|
+
#
|
4
|
+
# Put the name of any module -- using its config path -- in the section below.
|
5
|
+
#
|
6
|
+
logger:
|
7
|
+
# log action execution errors for easier debugging
|
8
|
+
action : DEBUG
|
9
|
+
|
10
|
+
index:
|
11
|
+
shard:
|
12
|
+
recovery: DEBUG
|
13
|
+
store: INFO
|
14
|
+
gateway: DEBUG
|
15
|
+
engine: DEBUG
|
16
|
+
merge: DEBUG
|
17
|
+
translog: DEBUG
|
18
|
+
cluster:
|
19
|
+
service: DEBUG
|
20
|
+
action:
|
21
|
+
shard: DEBUG
|
22
|
+
gateway: DEBUG
|
23
|
+
discovery: DEBUG
|
24
|
+
jmx: DEBUG
|
25
|
+
httpclient: INFO
|
26
|
+
node: DEBUG
|
27
|
+
plugins: DEBUG
|
28
|
+
|
29
|
+
appender:
|
30
|
+
console:
|
31
|
+
type: console
|
32
|
+
layout:
|
33
|
+
type: consolePattern
|
34
|
+
conversionPattern: "[%d{ABSOLUTE}][%-5p][%-25c] %m%n"
|
35
|
+
|
36
|
+
file:
|
37
|
+
type: dailyRollingFile
|
38
|
+
file: ${path.logs}/${cluster.name}.log
|
39
|
+
datePattern: "'.'yyyy-MM-dd"
|
40
|
+
layout:
|
41
|
+
type: pattern
|
42
|
+
conversionPattern: "[%d{ABSOLUTE}][%-5p][%-25c] %m%n"
|
43
|
+
|
@@ -0,0 +1,60 @@
|
|
1
|
+
#
|
2
|
+
# This file isn't read for any reason -- it's
|
3
|
+
# a dumping ground for annotated config sections
|
4
|
+
#
|
5
|
+
|
6
|
+
|
7
|
+
gateway:
|
8
|
+
# Settings for gateway.type = s3
|
9
|
+
s3:
|
10
|
+
bucket: infochimps-elasticsearch
|
11
|
+
|
12
|
+
gateway:
|
13
|
+
fs:
|
14
|
+
# By default, uses the 'path.work' directory. Note that the work directory is
|
15
|
+
# considered a temporary directory by ElasticSearch (meaning it is safe
|
16
|
+
# to rm -rf it), so although the default location of the persistent gateway
|
17
|
+
# in work is intentional, it should be changed.
|
18
|
+
#
|
19
|
+
# When explicitly specifying the gateway.fs.location, each node will
|
20
|
+
# append its cluster.name to the provided location. It means that the
|
21
|
+
# location provided can safely support several clusters.
|
22
|
+
#
|
23
|
+
# The file system gateway automatically sets for each index created to use
|
24
|
+
# an fs index gateway. The location specified using gateway.fs.location
|
25
|
+
# will automatically be used in this case to store index level data
|
26
|
+
# (appended by the index name).
|
27
|
+
location: /mnt2/elasticsearch/fs
|
28
|
+
|
29
|
+
discovery:
|
30
|
+
|
31
|
+
zen:
|
32
|
+
# == How should gossip be conducted?
|
33
|
+
ping:
|
34
|
+
multicast:
|
35
|
+
enabled: false
|
36
|
+
# group: 224.2.2.4
|
37
|
+
# port: 54328
|
38
|
+
# ttl: 3
|
39
|
+
# address: null
|
40
|
+
unicast:
|
41
|
+
# # Either a YAML array or a comma delimited string.
|
42
|
+
# # Each value is either in the form of host:port, or in the form of host[port1-port2].
|
43
|
+
# hosts:
|
44
|
+
# == Zen master election:
|
45
|
+
# As part of the initial ping process a master of the cluster is either
|
46
|
+
# elected or joined to. This is done automatically. The
|
47
|
+
# discovery.zen.initial_ping_timeout (which defaults to 3s) allows you to
|
48
|
+
# configure the election to handle cases of slow or congested networks
|
49
|
+
# (higher values assure less chance of failure).
|
50
|
+
initial_ping_timeout: 3s
|
51
|
+
# # Allow node to become master? Note, once a node is a client node
|
52
|
+
# # (node.client = true), it will not be allowed to become a master
|
53
|
+
# # (zen.master is automatically set to false).
|
54
|
+
# master: ~
|
55
|
+
# == Zen Fault detection:
|
56
|
+
fd:
|
57
|
+
ping_interval: 1s
|
58
|
+
ping_timeout: 30s
|
59
|
+
ping_retries: 3
|
60
|
+
|
@@ -0,0 +1,42 @@
|
|
1
|
+
#!/usr/bin/env bash

#
# This lets you run multiple daemons on the same machine. It points each
# daemon's data to /mnt$node/elasticsearch -- so running it with node='' will
# write to /mnt/elasticsearch, node=3 will write to /mnt3/elasticsearch.
#
# Usage:
#
#   sudo node=$node ES_MAX_MEM=1800m ./config/run_elasticsearch-2.sh
#
# To run multiple nodes:
#
#   for node in '' 2 3 ; do sudo node=$node ES_MAX_MEM=1800m ./config/run_elasticsearch-2.sh ; done
#
# Environment read by this script:
#   node        -- suffix selecting the /mnt$node data root (default: empty)
#   ES_MAX_MEM  -- JVM heap size (default: 1800m); also used as ES_MIN_MEM
#

# Which node?
node=${node-''}
echo "Running elasticsearch with node=$node"

# Where does elasticsearch live?
export ES_HOME=/usr/local/share/elasticsearch
export ES_CONF_DIR=/etc/elasticsearch
export ES_INCLUDE=$ES_CONF_DIR/elasticsearch.in.sh

# Where does data live?
ES_DATA_ROOT=/mnt$node/elasticsearch
export ES_DATA_DIR=$ES_DATA_ROOT/data
export ES_WORK_DIR=$ES_DATA_ROOT/work

# bump the # of open files way way up
ulimit -n 65536
# allow elasticsearch to lock itself into memory if JNA is installed
ulimit -l unlimited

# Force the heap size: min is pinned to max
export ES_MAX_MEM=${ES_MAX_MEM-1800m}
export ES_MIN_MEM=$ES_MAX_MEM

# Replace this shell with the elasticsearch launcher, running as the
# 'elasticsearch' user.  The config file path is derived from ES_CONF_DIR so
# the two cannot drift apart, and every expansion is quoted so an unexpected
# value of $node cannot split into extra arguments.
exec chpst -u elasticsearch "$ES_HOME/bin/elasticsearch" \
  -Des.config="$ES_CONF_DIR/elasticsearch.yml" \
  -p "/var/run/elasticsearch/es-$node.pid"
|
@@ -0,0 +1,12 @@
|
|
1
|
+
{
|
2
|
+
"ufo_sighting" : {
|
3
|
+
"properties" : {
|
4
|
+
"sighted_at" : {"type" : "string", "store" : "yes"},
|
5
|
+
"reported_at" : {"type" : "string", "store" : "yes"},
|
6
|
+
"location" : {"type" : "string", "store" : "yes"},
|
7
|
+
"shape" : {"type" : "string", "store" : "yes"},
|
8
|
+
"duration" : {"type" : "string", "store" : "yes"},
|
9
|
+
"description" : {"type" : "string", "store" : "yes"}
|
10
|
+
}
|
11
|
+
}
|
12
|
+
}
|
data/lib/wonderdog.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'wukong-hadoop'
|
2
|
+
|
3
|
+
module Wukong
|
4
|
+
|
5
|
+
# Wonderdog provides Java code that couples Hadoop streaming to
|
6
|
+
# Wukong. This module adds some overrides which enable the
|
7
|
+
# <tt>wu-hadoop</tt> program to leverage this code.
|
8
|
+
module Elasticsearch
|
9
|
+
end
|
10
|
+
end
|
11
|
+
|
12
|
+
require 'wonderdog/configuration'
|
13
|
+
require 'wonderdog/hadoop_invocation_override'
|
14
|
+
require 'wonderdog/timestamp'
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module Wukong
  module Elasticsearch

    # Registers every Elasticsearch-related setting on +settings+.
    #
    # Each setting is declared through <tt>settings.define</tt> with a
    # description, an optional default or type, and the
    # <tt>:wukong_hadoop => true</tt> flag.  Settings are defined in the
    # order they are listed below.
    #
    # @param [Configliere::Param] settings
    # @return [Configliere::Param] the same +settings+ object, now configured
    def self.configure(settings)
      # Rows of [name, description, extra options].
      definitions = [
        [:es_tmp_dir,        "Temporary directory on the HDFS to store job files while reading/writing to ElasticSearch",       { :default => "/user/#{ENV['USER']}/wukong" }],
        [:es_config,         "Where to find configuration files detailing how to join an ElasticSearch cluster",                {}],
        [:es_input_splits,   "Number of input splits to target when reading from ElasticSearch",                                { :type => Integer }],
        [:es_request_size,   "Number of objects requested during each batch read from ElasticSearch",                           { :type => Integer }],
        [:es_scroll_timeout, "Amount of time to wait on a scroll",                                                              {}],
        [:es_index_field,    "Field to use from each record to override the default index",                                     {}],
        [:es_mapping_field,  "Field to use from each record to override the default mapping",                                   {}],
        [:es_id_field,       "If this field is present in a record, make an update request, otherwise make a create request",   {}],
        [:es_bulk_size,      "Number of requests to batch locally before making a request to ElasticSearch",                    { :type => Integer }],
        [:es_query,          "Query to use when defining input splits for ElasticSearch input",                                 {}]
      ]

      definitions.each do |name, description, extras|
        options = { :description => description }.merge(extras).merge(:wukong_hadoop => true)
        settings.define(name, options)
      end

      settings
    end
  end
end
|
@@ -0,0 +1,139 @@
|
|
1
|
+
require_relative("index_and_mapping")
|
2
|
+
|
3
|
+
module Wukong
|
4
|
+
module Elasticsearch
|
5
|
+
|
6
|
+
# This module overrides some methods defined in
|
7
|
+
# Wukong::Hadoop::HadoopInvocation. The overrides will only come
|
8
|
+
# into play if the job's input or output paths are URIs beginning
|
9
|
+
# with 'es://', implying reading or writing to/from Elasticsearch
|
10
|
+
# indices.
|
11
|
+
module HadoopInvocationOverride

  # Name of the Java input format class used when the job reads from
  # Elasticsearch.
  #
  # @return [String]
  ES_STREAMING_INPUT_FORMAT = "com.infochimps.elasticsearch.ElasticSearchStreamingInputFormat"

  # Name of the Java output format class used when the job writes to
  # Elasticsearch.
  #
  # @return [String]
  ES_STREAMING_OUTPUT_FORMAT = "com.infochimps.elasticsearch.ElasticSearchStreamingOutputFormat"

  # Is the job's <tt>:input</tt> setting an Elasticsearch URI?
  #
  # @return [true, false] whatever IndexAndMapping.matches? reports
  def reads_from_elasticsearch?
    source = settings[:input]
    IndexAndMapping.matches?(source)
  end

  # The input format for this job: ES_STREAMING_INPUT_FORMAT when
  # reading from Elasticsearch, otherwise the inherited value.
  #
  # @return [String]
  def input_format
    return ES_STREAMING_INPUT_FORMAT if reads_from_elasticsearch?
    super()
  end

  # The index/mapping parsed from the <tt>:input</tt> setting.  Parsed
  # once and then cached.
  #
  # @return [IndexAndMapping]
  def input_index
    @input_index ||= IndexAndMapping.new(settings[:input])
  end

  # The input paths for this job: a timestamped directory beneath
  # <tt>settings[:es_tmp_dir]</tt> when reading from Elasticsearch,
  # otherwise the inherited value.
  #
  # @return [String]
  def input_paths
    return elasticsearch_hdfs_tmp_dir(input_index) if reads_from_elasticsearch?
    super()
  end

  # Is the job's <tt>:output</tt> setting an Elasticsearch URI?
  #
  # @return [true, false] whatever IndexAndMapping.matches? reports
  def writes_to_elasticsearch?
    destination = settings[:output]
    IndexAndMapping.matches?(destination)
  end

  # The output format for this job: ES_STREAMING_OUTPUT_FORMAT when
  # writing to Elasticsearch, otherwise the inherited value.
  #
  # @return [String]
  def output_format
    return ES_STREAMING_OUTPUT_FORMAT if writes_to_elasticsearch?
    super()
  end

  # The index/mapping parsed from the <tt>:output</tt> setting.  Parsed
  # once and then cached.
  #
  # @return [IndexAndMapping]
  def output_index
    @output_index ||= IndexAndMapping.new(settings[:output])
  end

  # The output path for this job: a timestamped directory beneath
  # <tt>settings[:es_tmp_dir]</tt> when writing to Elasticsearch,
  # otherwise the inherited value.
  #
  # @return [String]
  def output_path
    return elasticsearch_hdfs_tmp_dir(output_index) if writes_to_elasticsearch?
    super()
  end

  # The inherited jobconf options followed by the options the
  # Elasticsearch input/output formats need.
  #
  # Nothing is appended unless the job reads from or writes to
  # Elasticsearch.  The appended options are flattened and stripped of
  # +nil+ entries before being added to the inherited list.
  #
  # @return [Array<String>]
  def hadoop_jobconf_options
    inherited = super()
    extras    = []

    # Cluster configuration is needed in either direction.
    if reads_from_elasticsearch? || writes_to_elasticsearch?
      extras << java_opt('es.config', settings[:es_config])
    end

    if reads_from_elasticsearch?
      extras.push(
        java_opt('elasticsearch.input.index',          input_index.index),
        java_opt('elasticsearch.input.mapping',        input_index.mapping),
        java_opt('elasticsearch.input.splits',         settings[:es_input_splits]),
        java_opt('elasticsearch.input.query',          settings[:es_query]),
        java_opt('elasticsearch.input.request_size',   settings[:es_request_size]),
        java_opt('elasticsearch.input.scroll_timeout', settings[:es_scroll_timeout])
      )
    end

    if writes_to_elasticsearch?
      extras.push(
        java_opt('elasticsearch.output.index',         output_index.index),
        java_opt('elasticsearch.output.mapping',       output_index.mapping),
        java_opt('elasticsearch.output.index.field',   settings[:es_index_field]),
        java_opt('elasticsearch.output.mapping.field', settings[:es_mapping_field]),
        java_opt('elasticsearch.output.id.field',      settings[:es_id_field]),
        java_opt('elasticsearch.output.bulk_size',     settings[:es_bulk_size])
      )
    end

    inherited + extras.flatten.compact
  end

  # Builds a path of the form
  # <tt>es_tmp_dir/index[/mapping]/YYYY-mm-dd-HH-MM-SS</tt>.
  #
  # The index and mapping names have every character other than word
  # characters, '/', '.', '-' and '+' removed first.  The timestamp is
  # taken when the method is called, so separate calls can return
  # different paths.
  #
  # @param [IndexAndMapping] io
  # @return [String]
  def elasticsearch_hdfs_tmp_dir(io)
    cleaner   = %r{[^\w/\.\-\+]+}
    names     = [io.index, io.mapping].compact
    sanitized = names.map { |name| name.gsub(cleaner, '') }
    File.join(settings[:es_tmp_dir], sanitized.join('/'), Time.now.strftime("%Y-%m-%d-%H-%M-%S"))
  end

end
|
136
|
+
end
|
137
|
+
|
138
|
+
Hadoop::Driver.class_eval { include Elasticsearch::HadoopInvocationOverride }
|
139
|
+
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
module Wukong
  module Elasticsearch

    # A convenient class for parsing Elasticsearch index and mapping URIs
    # like
    #
    #   - es://my_index
    #   - es://my_index/my_mapping
    #   - es://first_index,second_index,third_index
    #   - es://my_index/first_mapping,second_mapping,third_mapping
    #
    # Comma-separated lists are not split here: they are stored verbatim
    # in #index and #mapping.
    class IndexAndMapping

      # A regular expression that matches URIs describing an
      # Elasticsearch index and/or mapping to read/write from/to.
      #
      # Anchored with \A rather than ^ so that only a string which
      # *starts* with the scheme matches; ^ would also match at the start
      # of any later line of a multi-line string.
      #
      # @return [Regexp]
      ES_SCHEME_REGEXP = %r{\Aes://}

      # The Elasticsearch index.
      #
      # @return [String]
      attr_reader :index

      # The Elasticsearch mapping.
      #
      # @return [String, nil] +nil+ when the URI names only an index
      attr_reader :mapping

      # Does the given +string+ look like a possible Elasticsearch
      # /index/mapping specification?
      #
      # @param [String, nil] string
      # @return [true, false]
      def self.matches? string
        return false unless string
        # =~ yields a match offset or nil; coerce to the documented boolean.
        !!(string =~ ES_SCHEME_REGEXP)
      end

      # Create a new index and mapping specification from the given
      # +uri+.
      #
      # @param [String] uri
      # @raise [Wukong::Error] if +uri+ is malformed
      def initialize uri
        self.uri = uri
      end

      # Set the URI of this index and mapping specification, parsing it
      # for an index and mapping.
      #
      # Leading and trailing slashes after the scheme are ignored, so
      # <tt>es:///my_index/</tt> is equivalent to <tt>es://my_index</tt>.
      #
      # @param [String] uri
      # @raise [Wukong::Error] if +uri+ does not start with
      #   <tt>es://</tt> or does not contain exactly one or two
      #   slash-separated parts
      def uri= uri
        malformed = "'#{uri}' is not an ElasticSearch es://index/mapping specification"
        raise Wukong::Error.new(malformed) unless self.class.matches?(uri)

        # Drop the scheme, then any slashes at either end of what remains.
        parts = uri.sub(ES_SCHEME_REGEXP, '').gsub(%r{\A/+|/+\z}, '').split('/')
        raise Wukong::Error.new(malformed) unless parts.size.between?(1, 2)

        # With a single part, @mapping is left nil.
        @index, @mapping = parts
      end
    end
  end
end
|
65
|
+
|
66
|
+
|
67
|
+
|