wonderdog 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (55) hide show
  1. data/.gitignore +49 -0
  2. data/.rspec +2 -0
  3. data/CHANGELOG.md +5 -0
  4. data/LICENSE.md +201 -0
  5. data/README.md +175 -0
  6. data/Rakefile +10 -0
  7. data/bin/estool +141 -0
  8. data/bin/estrus.rb +136 -0
  9. data/bin/wonderdog +93 -0
  10. data/config/elasticsearch-example.yml +227 -0
  11. data/config/elasticsearch.in.sh +52 -0
  12. data/config/logging.yml +43 -0
  13. data/config/more_settings.yml +60 -0
  14. data/config/run_elasticsearch-2.sh +42 -0
  15. data/config/ufo_config.json +12 -0
  16. data/lib/wonderdog.rb +14 -0
  17. data/lib/wonderdog/configuration.rb +25 -0
  18. data/lib/wonderdog/hadoop_invocation_override.rb +139 -0
  19. data/lib/wonderdog/index_and_mapping.rb +67 -0
  20. data/lib/wonderdog/timestamp.rb +43 -0
  21. data/lib/wonderdog/version.rb +3 -0
  22. data/notes/README-benchmarking.txt +272 -0
  23. data/notes/README-read_tuning.textile +74 -0
  24. data/notes/benchmarking-201011.numbers +0 -0
  25. data/notes/cluster_notes.md +17 -0
  26. data/notes/notes.txt +91 -0
  27. data/notes/pigstorefunc.pig +45 -0
  28. data/pom.xml +80 -0
  29. data/spec/spec_helper.rb +22 -0
  30. data/spec/support/driver_helper.rb +15 -0
  31. data/spec/support/integration_helper.rb +30 -0
  32. data/spec/wonderdog/hadoop_invocation_override_spec.rb +81 -0
  33. data/spec/wonderdog/index_and_type_spec.rb +73 -0
  34. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java +268 -0
  35. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputCommitter.java +39 -0
  36. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputFormat.java +283 -0
  37. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchSplit.java +60 -0
  38. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingInputFormat.java +231 -0
  39. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputCommitter.java +37 -0
  40. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java +88 -0
  41. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java +176 -0
  42. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java +171 -0
  43. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingSplit.java +102 -0
  44. data/src/main/java/com/infochimps/elasticsearch/ElasticTest.java +108 -0
  45. data/src/main/java/com/infochimps/elasticsearch/hadoop/util/HadoopUtils.java +100 -0
  46. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchIndex.java +216 -0
  47. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchJsonIndex.java +235 -0
  48. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchStorage.java +355 -0
  49. data/test/foo.json +3 -0
  50. data/test/foo.tsv +3 -0
  51. data/test/test_dump.pig +19 -0
  52. data/test/test_json_loader.pig +21 -0
  53. data/test/test_tsv_loader.pig +16 -0
  54. data/wonderdog.gemspec +32 -0
  55. metadata +130 -0
# Environment shared by every locally-launched Elasticsearch daemon.
# Each variable honors a value already present in the caller's
# environment and only falls back to the default when unset.
export ES_CONF_DIR=${ES_CONF_DIR-/etc/elasticsearch}
export ES_WORK_DIR=${ES_WORK_DIR-/mnt/elasticsearch/work}
export ES_DATA_DIR=${ES_DATA_DIR-/mnt/elasticsearch/data}

# Classpath: the AWS cloud plugin plus everything Elasticsearch ships with.
export CLASSPATH=$ES_HOME/plugins/cloud-aws.zip
CLASSPATH=$CLASSPATH:$ES_HOME/lib/elasticsearch-0.11.0.jar:$ES_HOME/lib/*:$ES_HOME/lib/sigar/*

# Raise the open-file limit way, way up.
ulimit -n 65536
# Let Elasticsearch mlock itself into RAM when JNA is available.
ulimit -l unlimited

# Default heap bounds, overridable from the caller's environment.
if [ -z "$ES_MIN_MEM" ]; then
  ES_MIN_MEM=256m
fi
if [ -z "$ES_MAX_MEM" ]; then
  ES_MAX_MEM=1500m
fi

# ---- Arguments to pass to the JVM -----------------------------------

# Heap bounds and per-thread stack size.
JAVA_OPTS="$JAVA_OPTS -Xms${ES_MIN_MEM}"
JAVA_OPTS="$JAVA_OPTS -Xmx${ES_MAX_MEM}"
JAVA_OPTS="$JAVA_OPTS -Xss128k"

JAVA_OPTS="$JAVA_OPTS -Djline.enabled=true"

JAVA_OPTS="$JAVA_OPTS -XX:+AggressiveOpts"

# Garbage-collection tuning plus a heap dump (under the work dir) on OOM.
JAVA_OPTS="$JAVA_OPTS -XX:+UseParNewGC"
JAVA_OPTS="$JAVA_OPTS -XX:+UseConcMarkSweepGC"
JAVA_OPTS="$JAVA_OPTS -XX:+CMSParallelRemarkEnabled"
JAVA_OPTS="$JAVA_OPTS -XX:SurvivorRatio=8"
JAVA_OPTS="$JAVA_OPTS -XX:MaxTenuringThreshold=1"
JAVA_OPTS="$JAVA_OPTS -XX:+HeapDumpOnOutOfMemoryError"
JAVA_OPTS="$JAVA_OPTS -XX:HeapDumpPath=$ES_WORK_DIR/heap"
# Verbose GC logging to the system log directory.
JAVA_OPTS="$JAVA_OPTS -XX:+PrintGCTimeStamps -XX:+PrintTenuringDistribution -XX:+TraceClassUnloading -XX:+PrintGCDetails -verbose:gc -Xloggc:/var/log/elasticsearch/elasticsearch-gc.log"

JAVA_OPTS="$JAVA_OPTS -XX:+UseCompressedOops" # avoid this on sun java < 1.6.0_20

# Ensures JMX is accessible from the outside world (no SSL, no auth).
JAVA_OPTS="$JAVA_OPTS -Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false -Djava.rmi.server.hostname=ec2-184-73-69-18.compute-1.amazonaws.com "

# More options to consider LATER
# java.net.preferIPv4Stack=true: Better OOTB experience, especially with jgroups
# -XX:CMSInitiatingOccupancyFraction=88

# Point Elasticsearch's own data/work paths at the directories above.
ES_JAVA_OPTS="$ES_JAVA_OPTS -Des.path.data=$ES_DATA_DIR -Des.path.work=$ES_WORK_DIR"

echo JAVA_OPTS="'$JAVA_OPTS'"
echo ES_JAVA_OPTS="'$ES_JAVA_OPTS'"

export JAVA_OPTS ES_JAVA_OPTS ES_MAX_MEM ES_MIN_MEM
@@ -0,0 +1,43 @@
1
+ rootLogger: DEBUG, console, file
2
+
3
+ #
4
+ # Put the name of any module -- using its config path -- in the section below.
5
+ #
6
+ logger:
7
+ # log action execution errors for easier debugging
8
+ action : DEBUG
9
+
10
+ index:
11
+ shard:
12
+ recovery: DEBUG
13
+ store: INFO
14
+ gateway: DEBUG
15
+ engine: DEBUG
16
+ merge: DEBUG
17
+ translog: DEBUG
18
+ cluster:
19
+ service: DEBUG
20
+ action:
21
+ shard: DEBUG
22
+ gateway: DEBUG
23
+ discovery: DEBUG
24
+ jmx: DEBUG
25
+ httpclient: INFO
26
+ node: DEBUG
27
+ plugins: DEBUG
28
+
29
+ appender:
30
+ console:
31
+ type: console
32
+ layout:
33
+ type: consolePattern
34
+ conversionPattern: "[%d{ABSOLUTE}][%-5p][%-25c] %m%n"
35
+
36
+ file:
37
+ type: dailyRollingFile
38
+ file: ${path.logs}/${cluster.name}.log
39
+ datePattern: "'.'yyyy-MM-dd"
40
+ layout:
41
+ type: pattern
42
+ conversionPattern: "[%d{ABSOLUTE}][%-5p][%-25c] %m%n"
43
+
@@ -0,0 +1,60 @@
1
+ #
2
+ # This file isn't read for any reason -- it's
3
+ # a dumping ground for annotated config sections
4
+ #
5
+
6
+
7
+ gateway:
8
+ # Settings for gateway.type = s3
9
+ s3:
10
+ bucket: infochimps-elasticsearch
11
+
12
+ gateway:
13
+ fs:
14
+ # By default, uses the 'path.work' directory Note, the work directory is
15
+ # considered a temporal directory with ElasticSearch (meaning it is safe
16
+ # to rm -rf it), the default location of the persistent gateway in work
17
+ # intentional, it should be changed.
18
+ #
19
+ # When explicitly specifying the gateway.fs.location, each node will
20
+ # append its cluster.name to the provided location. It means that the
21
+ # location provided can safely support several clusters.
22
+ #
23
+ # The file system gateway automatically sets for each index created to use
24
+ # an fs index gateway. The location specified using gateway.fs.location
25
+ # will automatically be used in this case to store index level data
26
+ # (appended by the index name).
27
+ location: /mnt2/elasticsearch/fs
28
+
29
+ discovery:
30
+
31
+ zen:
32
+ # == How should gossip be conducted?
33
+ ping:
34
+ multicast:
35
+ enabled: false
36
+ # group: 224.2.2.4
37
+ # port: 54328
38
+ # ttl: 3
39
+ # address: null
40
+ unicast:
41
+ # # Either a YAML array or a comma delimited string.
42
+ # # Each value is either in the form of host:port, or in the form of host[port1-port2].
43
+ # hosts:
44
+ # == Zen master election:
45
+ # As part of the initial ping process a master of the cluster is either
46
+ # elected or joined to. This is done automatically. The
47
+ # discovery.zen.initial_ping_timeout (which defaults to 3s) allows to
48
+ # configure the election to handle cases of slow or congested networks
49
+ # (higher values assure less chance of failure).
50
+ initial_ping_timeout: 3s
51
+ # # Allow node to become master? Note, once a node is a client node
52
+ # # (node.client = true), it will not be allowed to become a master
53
+ # # (zen.master is automatically set to false).
54
+ # master: ~
55
+ # == Zen Fault detection:
56
+ fd:
57
+ ping_interval: 1s
58
+ ping_timeout: 30s
59
+       ping_retries: 3
60
+
#!/usr/bin/env bash

#
# This lets you run multiple daemons on the same machine. It points each
# daemon's data to /mnt$node/elasticsearch -- so running it with node='' will
# write to /mnt/elasticsearch, node=3 will write to /mnt3/elasticsearch.
#
# Usage (single node):
#
#   sudo node=$node ES_MAX_MEM=1800m ./config/run_elasticsearch-2.sh
#
# To run multiple nodes:
#
#   for node in '' 2 3 ; do sudo node=$node ES_MAX_MEM=1800m ./config/run_elasticsearch-2.sh ; done
#

# Which node?  Empty (the default) targets the bare /mnt mount.
node=${node-''}
echo "Running elasticsearch with node=$node"

# Where does elasticsearch live?
export ES_HOME=/usr/local/share/elasticsearch
export ES_CONF_DIR=/etc/elasticsearch
export ES_INCLUDE=$ES_CONF_DIR/elasticsearch.in.sh

# Where does data live?
ES_DATA_ROOT=/mnt$node/elasticsearch
export ES_DATA_DIR=$ES_DATA_ROOT/data
export ES_WORK_DIR=$ES_DATA_ROOT/work

# bump the # of open files way way up
ulimit -n 65536
# allow elasticsearch to lock itself into memory if JNA is installed
ulimit -l unlimited

# Force the heap size: pinning min to max avoids resize pauses.
export ES_MAX_MEM=${ES_MAX_MEM-1800m}
export ES_MIN_MEM=$ES_MAX_MEM

# Drop privileges to the elasticsearch user and replace this shell.
exec chpst -u elasticsearch $ES_HOME/bin/elasticsearch \
  -Des.config=/etc/elasticsearch/elasticsearch.yml \
  -p /var/run/elasticsearch/es-$node.pid
@@ -0,0 +1,12 @@
1
+ {
2
+ "ufo_sighting" : {
3
+ "properties" : {
4
+ "sighted_at" : {"type" : "string", "store" : "yes"},
5
+ "reported_at" : {"type" : "string", "store" : "yes"},
6
+ "location" : {"type" : "string", "store" : "yes"},
7
+ "shape" : {"type" : "string", "store" : "yes"},
8
+ "duration" : {"type" : "string", "store" : "yes"},
9
+ "description" : {"type" : "string", "store" : "yes"}
10
+ }
11
+ }
12
+ }
require 'wukong-hadoop'

module Wukong

  # Wonderdog couples Hadoop streaming to Wukong through bundled Java
  # code.  The files required below add overrides that let the
  # <tt>wu-hadoop</tt> program take advantage of that code.
  module Elasticsearch
  end
end

require 'wonderdog/configuration'
require 'wonderdog/hadoop_invocation_override'
require 'wonderdog/timestamp'
module Wukong
  module Elasticsearch

    # Configure the given +settings+ to be able to work with
    # Elasticsearch.
    #
    # Each setting is declared with <tt>:wukong_hadoop => true</tt> so
    # it is forwarded to the underlying Hadoop invocation.
    #
    # @param [Configliere::Param] settings
    # @return [Configliere::Param] the newly configured settings
    def self.configure settings
      definitions = {
        es_tmp_dir:        { description: "Temporary directory on the HDFS to store job files while reading/writing to ElasticSearch", default: "/user/#{ENV['USER']}/wukong" },
        es_config:         { description: "Where to find configuration files detailing how to join an ElasticSearch cluster" },
        es_input_splits:   { description: "Number of input splits to target when reading from ElasticSearch", type: Integer },
        es_request_size:   { description: "Number of objects requested during each batch read from ElasticSearch", type: Integer },
        es_scroll_timeout: { description: "Amount of time to wait on a scroll" },
        es_index_field:    { description: "Field to use from each record to override the default index" },
        es_mapping_field:  { description: "Field to use from each record to override the default mapping" },
        es_id_field:       { description: "If this field is present in a record, make an update request, otherwise make a create request" },
        es_bulk_size:      { description: "Number of requests to batch locally before making a request to ElasticSearch", type: Integer },
        es_query:          { description: "Query to use when defining input splits for ElasticSearch input" },
      }
      definitions.each do |name, options|
        settings.define(name, options.merge(wukong_hadoop: true))
      end

      settings
    end
  end

end
require_relative("index_and_mapping")

module Wukong
  module Elasticsearch

    # Overrides for some methods defined in
    # Wukong::Hadoop::HadoopInvocation.  They only come into play when
    # the job's input or output path is a URI beginning with 'es://',
    # which implies reading from or writing to Elasticsearch indices.
    module HadoopInvocationOverride

      # Input format used when reading from Elasticsearch, as defined
      # in the Java code accompanying Wonderdog.
      #
      # @return [String]
      ES_STREAMING_INPUT_FORMAT = "com.infochimps.elasticsearch.ElasticSearchStreamingInputFormat"

      # Output format used when writing to Elasticsearch, as defined
      # in the Java code accompanying Wonderdog.
      #
      # @return [String]
      ES_STREAMING_OUTPUT_FORMAT = "com.infochimps.elasticsearch.ElasticSearchStreamingOutputFormat"

      # Does this job read from Elasticsearch?
      #
      # @return [true, false]
      def reads_from_elasticsearch?
        IndexAndMapping.matches?(settings[:input])
      end

      # The input format to use for this job: the Elasticsearch
      # streaming format when reading from Elasticsearch, otherwise
      # whatever the default invocation would use.
      #
      # @return [String]
      def input_format
        return ES_STREAMING_INPUT_FORMAT if reads_from_elasticsearch?
        super()
      end

      # The input index to use, parsed lazily from settings[:input].
      #
      # @return [IndexAndMapping]
      def input_index
        @input_index ||= IndexAndMapping.new(settings[:input])
      end

      # The input paths to use for this job: a temporary HDFS path
      # when reading from Elasticsearch, otherwise the default.
      #
      # @return [String]
      def input_paths
        return elasticsearch_hdfs_tmp_dir(input_index) if reads_from_elasticsearch?
        super()
      end

      # Does this job write to Elasticsearch?
      #
      # @return [true, false]
      def writes_to_elasticsearch?
        IndexAndMapping.matches?(settings[:output])
      end

      # The output format to use for this job: the Elasticsearch
      # streaming format when writing to Elasticsearch, otherwise
      # whatever the default invocation would use.
      #
      # @return [String]
      def output_format
        return ES_STREAMING_OUTPUT_FORMAT if writes_to_elasticsearch?
        super()
      end

      # The output index to use, parsed lazily from settings[:output].
      #
      # @return [IndexAndMapping]
      def output_index
        @output_index ||= IndexAndMapping.new(settings[:output])
      end

      # The output path to use for this job: a temporary HDFS path
      # when writing to Elasticsearch, otherwise the default.
      #
      # @return [String]
      def output_path
        return elasticsearch_hdfs_tmp_dir(output_index) if writes_to_elasticsearch?
        super()
      end

      # Adds the Java options required by the input/output formats
      # defined in the Java code accompanying Wonderdog.
      #
      # Leaves the default Hadoop jobconf options untouched unless the
      # job actually talks to Elasticsearch.
      #
      # @return [Array<String>]
      def hadoop_jobconf_options
        extra = []
        if reads_from_elasticsearch? || writes_to_elasticsearch?
          extra << java_opt('es.config', settings[:es_config])
        end
        if reads_from_elasticsearch?
          extra << java_opt('elasticsearch.input.index',          input_index.index)
          extra << java_opt('elasticsearch.input.mapping',        input_index.mapping)
          extra << java_opt('elasticsearch.input.splits',         settings[:es_input_splits])
          extra << java_opt('elasticsearch.input.query',          settings[:es_query])
          extra << java_opt('elasticsearch.input.request_size',   settings[:es_request_size])
          extra << java_opt('elasticsearch.input.scroll_timeout', settings[:es_scroll_timeout])
        end
        if writes_to_elasticsearch?
          extra << java_opt('elasticsearch.output.index',         output_index.index)
          extra << java_opt('elasticsearch.output.mapping',       output_index.mapping)
          extra << java_opt('elasticsearch.output.index.field',   settings[:es_index_field])
          extra << java_opt('elasticsearch.output.mapping.field', settings[:es_mapping_field])
          extra << java_opt('elasticsearch.output.id.field',      settings[:es_id_field])
          extra << java_opt('elasticsearch.output.bulk_size',     settings[:es_bulk_size])
        end
        super() + extra.flatten.compact
      end

      # Returns a temporary path on the HDFS in which to store log
      # data while the Hadoop job runs, namespaced by index, mapping,
      # and a timestamp.
      #
      # @param [IndexAndMapping] io
      # @return [String]
      def elasticsearch_hdfs_tmp_dir io
        scrubber  = %r{[^\w/\.\-\+]+}
        path_part = [io.index, io.mapping].compact.map { |piece| piece.gsub(scrubber, '') }.join('/')
        File.join(settings[:es_tmp_dir], path_part, Time.now.strftime("%Y-%m-%d-%H-%M-%S"))
      end

    end
  end

  Hadoop::Driver.class_eval { include Elasticsearch::HadoopInvocationOverride }
end
module Wukong
  module Elasticsearch

    # A convenient class for parsing Elasticsearch index and mapping URIs
    # like
    #
    # - es://my_index
    # - es://my_index/my_mapping
    # - es://first_index,second_index,third_index
    # - es://my_index/first_mapping,second_mapping,third_mapping
    class IndexAndMapping

      # A regular expression that matches URIs describing an
      # Elasticsearch index and/or mapping to read/write from/to.
      #
      # @return [Regexp]
      ES_SCHEME_REGEXP = %r{^es://}

      # The Elasticsearch index.
      #
      # @return [String]
      attr_reader :index

      # The Elasticsearch mapping (+nil+ when the URI names only an index).
      #
      # @return [String, nil]
      attr_reader :mapping

      # Does the given +string+ look like a possible Elasticsearch
      # /index/mapping specification?
      #
      # @param [String] string
      # @return [true, false]
      def self.matches? string
        return false unless string
        # Coerce the Integer/nil returned by =~ into a real boolean so
        # the return value honors the documented contract.
        !!(string =~ ES_SCHEME_REGEXP)
      end

      # Create a new index and mapping specification from the given
      # +uri+.
      #
      # @param [String] uri
      def initialize uri
        self.uri = uri
      end

      # Set the URI of this index and mapping specification, parsing it
      # for an index and mapping.
      #
      # Will raise an error if the given URI is malformed.
      #
      # @param [String] uri
      # @raise [Wukong::Error] if +uri+ lacks the es:// scheme or has
      #   more than two path segments
      def uri= uri
        raise Wukong::Error.new("'#{uri}' is not an ElasticSearch es://index/mapping specification") unless self.class.matches?(uri)
        # Strip the scheme plus any leading/trailing slashes, then split
        # the remainder into at most index and mapping segments.
        parts = uri.gsub(ES_SCHEME_REGEXP, '').gsub(/^\/+/,'').gsub(/\/+$/,'').split('/')

        raise Wukong::Error.new("'#{uri}' is not an ElasticSearch es://index/mapping specification") unless parts.size.between?(1,2)

        @index   = parts[0]
        @mapping = parts[1]
      end
    end
  end
end