wonderdog 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. data/.gitignore +49 -0
  2. data/.rspec +2 -0
  3. data/CHANGELOG.md +5 -0
  4. data/LICENSE.md +201 -0
  5. data/README.md +175 -0
  6. data/Rakefile +10 -0
  7. data/bin/estool +141 -0
  8. data/bin/estrus.rb +136 -0
  9. data/bin/wonderdog +93 -0
  10. data/config/elasticsearch-example.yml +227 -0
  11. data/config/elasticsearch.in.sh +52 -0
  12. data/config/logging.yml +43 -0
  13. data/config/more_settings.yml +60 -0
  14. data/config/run_elasticsearch-2.sh +42 -0
  15. data/config/ufo_config.json +12 -0
  16. data/lib/wonderdog.rb +14 -0
  17. data/lib/wonderdog/configuration.rb +25 -0
  18. data/lib/wonderdog/hadoop_invocation_override.rb +139 -0
  19. data/lib/wonderdog/index_and_mapping.rb +67 -0
  20. data/lib/wonderdog/timestamp.rb +43 -0
  21. data/lib/wonderdog/version.rb +3 -0
  22. data/notes/README-benchmarking.txt +272 -0
  23. data/notes/README-read_tuning.textile +74 -0
  24. data/notes/benchmarking-201011.numbers +0 -0
  25. data/notes/cluster_notes.md +17 -0
  26. data/notes/notes.txt +91 -0
  27. data/notes/pigstorefunc.pig +45 -0
  28. data/pom.xml +80 -0
  29. data/spec/spec_helper.rb +22 -0
  30. data/spec/support/driver_helper.rb +15 -0
  31. data/spec/support/integration_helper.rb +30 -0
  32. data/spec/wonderdog/hadoop_invocation_override_spec.rb +81 -0
  33. data/spec/wonderdog/index_and_type_spec.rb +73 -0
  34. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java +268 -0
  35. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputCommitter.java +39 -0
  36. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputFormat.java +283 -0
  37. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchSplit.java +60 -0
  38. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingInputFormat.java +231 -0
  39. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputCommitter.java +37 -0
  40. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java +88 -0
  41. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java +176 -0
  42. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java +171 -0
  43. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingSplit.java +102 -0
  44. data/src/main/java/com/infochimps/elasticsearch/ElasticTest.java +108 -0
  45. data/src/main/java/com/infochimps/elasticsearch/hadoop/util/HadoopUtils.java +100 -0
  46. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchIndex.java +216 -0
  47. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchJsonIndex.java +235 -0
  48. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchStorage.java +355 -0
  49. data/test/foo.json +3 -0
  50. data/test/foo.tsv +3 -0
  51. data/test/test_dump.pig +19 -0
  52. data/test/test_json_loader.pig +21 -0
  53. data/test/test_tsv_loader.pig +16 -0
  54. data/wonderdog.gemspec +32 -0
  55. metadata +130 -0
@@ -0,0 +1,52 @@
1
+ export ES_CONF_DIR=${ES_CONF_DIR-/etc/elasticsearch}
2
+ export ES_WORK_DIR=${ES_WORK_DIR-/mnt/elasticsearch/work}
3
+ export ES_DATA_DIR=${ES_DATA_DIR-/mnt/elasticsearch/data}
4
+
5
+ export CLASSPATH=$ES_HOME/plugins/cloud-aws.zip
6
+ CLASSPATH=$CLASSPATH:$ES_HOME/lib/elasticsearch-0.11.0.jar:$ES_HOME/lib/*:$ES_HOME/lib/sigar/*
7
+
8
+ # bump the # of open files way way up
9
+ ulimit -n 65536
10
+ # allow elasticsearch to lock itself into memory if JNA is installed
11
+ ulimit -l unlimited
12
+
13
+ if [ "x$ES_MIN_MEM" = "x" ]; then
14
+ ES_MIN_MEM=256m
15
+ fi
16
+ if [ "x$ES_MAX_MEM" = "x" ]; then
17
+ ES_MAX_MEM=1500m
18
+ fi
19
+
20
+ # Arguments to pass to the JVM
21
+ JAVA_OPTS="$JAVA_OPTS -Xms${ES_MIN_MEM}"
22
+ JAVA_OPTS="$JAVA_OPTS -Xmx${ES_MAX_MEM}"
23
+ JAVA_OPTS="$JAVA_OPTS -Xss128k"
24
+
25
+ JAVA_OPTS="$JAVA_OPTS -Djline.enabled=true"
26
+
27
+ JAVA_OPTS="$JAVA_OPTS -XX:+AggressiveOpts"
28
+
29
+ JAVA_OPTS="$JAVA_OPTS -XX:+UseParNewGC"
30
+ JAVA_OPTS="$JAVA_OPTS -XX:+UseConcMarkSweepGC"
31
+ JAVA_OPTS="$JAVA_OPTS -XX:+CMSParallelRemarkEnabled"
32
+ JAVA_OPTS="$JAVA_OPTS -XX:SurvivorRatio=8"
33
+ JAVA_OPTS="$JAVA_OPTS -XX:MaxTenuringThreshold=1"
34
+ JAVA_OPTS="$JAVA_OPTS -XX:+HeapDumpOnOutOfMemoryError"
35
+ JAVA_OPTS="$JAVA_OPTS -XX:HeapDumpPath=$ES_WORK_DIR/heap"
36
+ JAVA_OPTS="$JAVA_OPTS -XX:+PrintGCTimeStamps -XX:+PrintTenuringDistribution -XX:+TraceClassUnloading -XX:+PrintGCDetails -verbose:gc -Xloggc:/var/log/elasticsearch/elasticsearch-gc.log"
37
+
38
+ JAVA_OPTS="$JAVA_OPTS -XX:+UseCompressedOops" # avoid this on sun java < 1.6.0_20
39
+
40
+ # ensures JMX accessible from outside world
41
+ JAVA_OPTS="$JAVA_OPTS -Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false -Djava.rmi.server.hostname=ec2-184-73-69-18.compute-1.amazonaws.com "
42
+
43
+ # More options to consider LATER
44
+ # java.net.preferIPv4Stack=true: Better OOTB experience, especially with jgroups
45
+ # -XX:CMSInitiatingOccupancyFraction=88
46
+
47
+ ES_JAVA_OPTS="$ES_JAVA_OPTS -Des.path.data=$ES_DATA_DIR -Des.path.work=$ES_WORK_DIR"
48
+
49
+ echo JAVA_OPTS="'$JAVA_OPTS'"
50
+ echo ES_JAVA_OPTS="'$ES_JAVA_OPTS'"
51
+
52
+ export JAVA_OPTS ES_JAVA_OPTS ES_MAX_MEM ES_MIN_MEM
@@ -0,0 +1,43 @@
1
+ rootLogger: DEBUG, console, file
2
+
3
+ #
4
+ # Put the name of any module -- using its config path -- in the section below.
5
+ #
6
+ logger:
7
+ # log action execution errors for easier debugging
8
+ action : DEBUG
9
+
10
+ index:
11
+ shard:
12
+ recovery: DEBUG
13
+ store: INFO
14
+ gateway: DEBUG
15
+ engine: DEBUG
16
+ merge: DEBUG
17
+ translog: DEBUG
18
+ cluster:
19
+ service: DEBUG
20
+ action:
21
+ shard: DEBUG
22
+ gateway: DEBUG
23
+ discovery: DEBUG
24
+ jmx: DEBUG
25
+ httpclient: INFO
26
+ node: DEBUG
27
+ plugins: DEBUG
28
+
29
+ appender:
30
+ console:
31
+ type: console
32
+ layout:
33
+ type: consolePattern
34
+ conversionPattern: "[%d{ABSOLUTE}][%-5p][%-25c] %m%n"
35
+
36
+ file:
37
+ type: dailyRollingFile
38
+ file: ${path.logs}/${cluster.name}.log
39
+ datePattern: "'.'yyyy-MM-dd"
40
+ layout:
41
+ type: pattern
42
+ conversionPattern: "[%d{ABSOLUTE}][%-5p][%-25c] %m%n"
43
+
@@ -0,0 +1,60 @@
1
+ #
2
+ # This file isn't read for any reason -- it's
3
+ # a dumping ground for annotated config sections
4
+ #
5
+
6
+
7
+ gateway:
8
+ # Settings for gateway.type = s3
9
+ s3:
10
+ bucket: infochimps-elasticsearch
11
+
12
+ gateway:
13
+ fs:
14
+ # By default, uses the 'path.work' directory. Note, the work directory is
15
+ # considered a temporal directory with ElasticSearch (meaning it is safe
16
+ # to rm -rf it), so the default location of the persistent gateway in the
17
+ # work directory is unintentional; it should be changed.
18
+ #
19
+ # When explicitly specifying the gateway.fs.location, each node will
20
+ # append its cluster.name to the provided location. It means that the
21
+ # location provided can safely support several clusters.
22
+ #
23
+ # The file system gateway automatically sets for each index created to use
24
+ # an fs index gateway. The location specified using gateway.fs.location
25
+ # will automatically be used in this case to store index level data
26
+ # (appended by the index name).
27
+ location: /mnt2/elasticsearch/fs
28
+
29
+ discovery:
30
+
31
+ zen:
32
+ # == How should gossip be conducted?
33
+ ping:
34
+ multicast:
35
+ enabled: false
36
+ # group: 224.2.2.4
37
+ # port: 54328
38
+ # ttl: 3
39
+ # address: null
40
+ unicast:
41
+ # # Either a YAML array or a comma delimited string.
42
+ # # Each value is either in the form of host:port, or in the form of host[port1-port2].
43
+ # hosts:
44
+ # == Zen master election:
45
+ # As part of the initial ping process a master of the cluster is either
46
+ # elected or joined to. This is done automatically. The
47
+ # discovery.zen.initial_ping_timeout (which defaults to 3s) allows to
48
+ # configure the election to handle cases of slow or congested networks
49
+ # (higher values assure less chance of failure).
50
+ initial_ping_timeout: 3s
51
+ # # Allow node to become master? Note, once a node is a client node
52
+ # # (node.client = true), it will not be allowed to become a master
53
+ # # (zen.master is automatically set to false).
54
+ # master: ~
55
+ # == Zen Fault detection:
56
+ fd:
57
+ ping_interval: 1s
58
+ ping_timeout: 30s
59
+ ping_retries: 3
60
+
@@ -0,0 +1,42 @@
1
+ #!/usr/bin/env bash
2
+
3
+ #
4
+ # This lets you run multiple daemons on the same machine. It points each
5
+ # daemon's data to /mnt$node/elasticsearch -- so running it with node='' will
6
+ # write to /mnt/elasticsearch, node=3 will write to /mnt3/elasticsearch.
7
+ #
8
+ # Usage:
9
+ #
10
+ # sudo node=$node ES_MAX_MEM=1800m ./config/run_elasticsearch-2.sh
11
+ #
12
+ # To run multiple nodes:
13
+ #
14
+ # for node in '' 2 3 ; do sudo node=$node ES_MAX_MEM=1800m ./config/run_elasticsearch-2.sh ; done
15
+ #
16
+
17
+ # Which node?
18
+ node=${node-''}
19
+ echo "Running elasticsearch with node=$node"
20
+
21
+ # Where does elasticsearch live?
22
+ export ES_HOME=/usr/local/share/elasticsearch
23
+ export ES_CONF_DIR=/etc/elasticsearch
24
+ export ES_INCLUDE=$ES_CONF_DIR/elasticsearch.in.sh
25
+
26
+ # Where does data live?
27
+ ES_DATA_ROOT=/mnt$node/elasticsearch
28
+ export ES_DATA_DIR=$ES_DATA_ROOT/data
29
+ export ES_WORK_DIR=$ES_DATA_ROOT/work
30
+
31
+ # bump the # of open files way way up
32
+ ulimit -n 65536
33
+ # allow elasticsearch to lock itself into memory if JNA is installed
34
+ ulimit -l unlimited
35
+
36
+ # Force the heap size
37
+ export ES_MAX_MEM=${ES_MAX_MEM-1800m}
38
+ export ES_MIN_MEM=$ES_MAX_MEM
39
+
40
+ exec chpst -u elasticsearch $ES_HOME/bin/elasticsearch \
41
+ -Des.config=/etc/elasticsearch/elasticsearch.yml \
42
+ -p /var/run/elasticsearch/es-$node.pid
@@ -0,0 +1,12 @@
1
+ {
2
+ "ufo_sighting" : {
3
+ "properties" : {
4
+ "sighted_at" : {"type" : "string", "store" : "yes"},
5
+ "reported_at" : {"type" : "string", "store" : "yes"},
6
+ "location" : {"type" : "string", "store" : "yes"},
7
+ "shape" : {"type" : "string", "store" : "yes"},
8
+ "duration" : {"type" : "string", "store" : "yes"},
9
+ "description" : {"type" : "string", "store" : "yes"}
10
+ }
11
+ }
12
+ }
@@ -0,0 +1,14 @@
1
+ require 'wukong-hadoop'
2
+
3
+ module Wukong
4
+
5
+ # Wonderdog provides Java code that couples Hadoop streaming to
6
+ # Wukong. This module adds some overrides which enables the
7
+ # <tt>wu-hadoop</tt> program to leverage this code.
8
+ module Elasticsearch
9
+ end
10
+ end
11
+
12
+ require 'wonderdog/configuration'
13
+ require 'wonderdog/hadoop_invocation_override'
14
+ require 'wonderdog/timestamp'
@@ -0,0 +1,25 @@
1
+ module Wukong
2
+ module Elasticsearch
3
+
4
+ # Configure the given +settings+ to be able to work with
5
+ # Elasticsearch.
6
+ #
7
+ # @param [Configliere::Param] settings
8
+ # @return [Configliere::Param] the newly configured settings
9
+ def self.configure settings
10
+ settings.define(:es_tmp_dir, :description => "Temporary directory on the HDFS to store job files while reading/writing to ElasticSearch", :default => "/user/#{ENV['USER']}/wukong", :wukong_hadoop => true)
11
+ settings.define(:es_config, :description => "Where to find configuration files detailing how to join an ElasticSearch cluster", :wukong_hadoop => true)
12
+ settings.define(:es_input_splits, :description => "Number of input splits to target when reading from ElasticSearch", :type => Integer, :wukong_hadoop => true)
13
+ settings.define(:es_request_size, :description => "Number of objects requested during each batch read from ElasticSearch", :type => Integer, :wukong_hadoop => true)
14
+ settings.define(:es_scroll_timeout, :description => "Amount of time to wait on a scroll", :wukong_hadoop => true)
15
+ settings.define(:es_index_field, :description => "Field to use from each record to override the default index", :wukong_hadoop => true)
16
+ settings.define(:es_mapping_field, :description => "Field to use from each record to override the default mapping", :wukong_hadoop => true)
17
+ settings.define(:es_id_field, :description => "If this field is present in a record, make an update request, otherwise make a create request", :wukong_hadoop => true)
18
+ settings.define(:es_bulk_size, :description => "Number of requests to batch locally before making a request to ElasticSearch", :type => Integer, :wukong_hadoop => true)
19
+ settings.define(:es_query, :description => "Query to use when defining input splits for ElasticSearch input", :wukong_hadoop => true)
20
+
21
+ settings
22
+ end
23
+ end
24
+
25
+ end
@@ -0,0 +1,139 @@
1
+ require_relative("index_and_mapping")
2
+
3
+ module Wukong
4
+ module Elasticsearch
5
+
6
+ # This module overrides some methods defined in
7
+ # Wukong::Hadoop::HadoopInvocation. The overrides will only come
8
+ # into play if the job's input or output paths are URIs beginning
9
+ # with 'es://', implying reading or writing to/from Elasticsearch
10
+ # indices.
11
+ module HadoopInvocationOverride
12
+
13
+ # The input format when reading from Elasticsearch as defined in
14
+ # the Java code accompanying Wonderdog.
15
+ #
16
+ # @param [String]
17
+ ES_STREAMING_INPUT_FORMAT = "com.infochimps.elasticsearch.ElasticSearchStreamingInputFormat"
18
+
19
+ # The output format when writing to Elasticsearch as defined in
20
+ # the Java code accompanying Wonderdog.
21
+ #
22
+ # @param [String]
23
+ ES_STREAMING_OUTPUT_FORMAT = "com.infochimps.elasticsearch.ElasticSearchStreamingOutputFormat"
24
+
25
+ # Does this job read from Elasticsearch?
26
+ #
27
+ # @return [true, false]
28
+ def reads_from_elasticsearch?
29
+ IndexAndMapping.matches?(settings[:input])
30
+ end
31
+
32
+ # The input format to use for this job.
33
+ #
34
+ # Will override the default value to ES_STREAMING_INPUT_FORMAT if
35
+ # reading from Elasticsearch.
36
+ #
37
+ # @return [String]
38
+ def input_format
39
+ reads_from_elasticsearch? ? ES_STREAMING_INPUT_FORMAT : super()
40
+ end
41
+
42
+ # The input index to use.
43
+ #
44
+ # @return [IndexAndMapping]
45
+ def input_index
46
+ @input_index ||= IndexAndMapping.new(settings[:input])
47
+ end
48
+
49
+ # The input paths to use for this job.
50
+ #
51
+ # Will override the default value with a temporary HDFS path
52
+ # when reading from Elasticsearch.
53
+ #
54
+ # @return [String]
55
+ def input_paths
56
+ reads_from_elasticsearch? ? elasticsearch_hdfs_tmp_dir(input_index) : super()
57
+ end
58
+
59
+ # Does this write to Elasticsearch?
60
+ #
61
+ # @return [true, false]
62
+ def writes_to_elasticsearch?
63
+ IndexAndMapping.matches?(settings[:output])
64
+ end
65
+
66
+ # The output format to use for this job.
67
+ #
68
+ # Will override the default value to ES_STREAMING_OUTPUT_FORMAT if
69
+ # writing to Elasticsearch.
70
+ #
71
+ # @return [String]
72
+ def output_format
73
+ writes_to_elasticsearch? ? ES_STREAMING_OUTPUT_FORMAT : super()
74
+ end
75
+
76
+ # The output index to use.
77
+ #
78
+ # @return [IndexAndMapping]
79
+ def output_index
80
+ @output_index ||= IndexAndMapping.new(settings[:output])
81
+ end
82
+
83
+ # The output path to use for this job.
84
+ #
85
+ # Will override the default value with a temporary HDFS path
86
+ # when writing to Elasticsearch.
87
+ #
88
+ # @return [String]
89
+ def output_path
90
+ writes_to_elasticsearch? ? elasticsearch_hdfs_tmp_dir(output_index) : super()
91
+ end
92
+
93
+ # Adds Java options required to interact with the input/output
94
+ # formats defined by the Java code accompanying Wonderdog.
95
+ #
96
+ # Will not change the default Hadoop jobconf options unless it
97
+ # has to.
98
+ #
99
+ # @return [Array<String>]
100
+ def hadoop_jobconf_options
101
+ super() + [].tap do |o|
102
+ o << java_opt('es.config', settings[:es_config]) if (reads_from_elasticsearch? || writes_to_elasticsearch?)
103
+
104
+ if reads_from_elasticsearch?
105
+ o << java_opt('elasticsearch.input.index', input_index.index)
106
+ o << java_opt('elasticsearch.input.mapping', input_index.mapping)
107
+ o << java_opt('elasticsearch.input.splits', settings[:es_input_splits])
108
+ o << java_opt('elasticsearch.input.query', settings[:es_query])
109
+ o << java_opt('elasticsearch.input.request_size', settings[:es_request_size])
110
+ o << java_opt('elasticsearch.input.scroll_timeout', settings[:es_scroll_timeout])
111
+ end
112
+
113
+ if writes_to_elasticsearch?
114
+ o << java_opt('elasticsearch.output.index', output_index.index)
115
+ o << java_opt('elasticsearch.output.mapping', output_index.mapping)
116
+ o << java_opt('elasticsearch.output.index.field', settings[:es_index_field])
117
+ o << java_opt('elasticsearch.output.mapping.field', settings[:es_mapping_field])
118
+ o << java_opt('elasticsearch.output.id.field', settings[:es_id_field])
119
+ o << java_opt('elasticsearch.output.bulk_size', settings[:es_bulk_size])
120
+ end
121
+ end.flatten.compact
122
+ end
123
+
124
+ # Returns a temporary path on the HDFS in which to store log
125
+ # data while the Hadoop job runs.
126
+ #
127
+ # @param [IndexAndMapping] io
128
+ # @return [String]
129
+ def elasticsearch_hdfs_tmp_dir io
130
+ cleaner = %r{[^\w/\.\-\+]+}
131
+ io_part = [io.index, io.mapping].compact.map { |s| s.gsub(cleaner, '') }.join('/')
132
+ File.join(settings[:es_tmp_dir], io_part, Time.now.strftime("%Y-%m-%d-%H-%M-%S"))
133
+ end
134
+
135
+ end
136
+ end
137
+
138
+ Hadoop::Driver.class_eval { include Elasticsearch::HadoopInvocationOverride }
139
+ end
@@ -0,0 +1,67 @@
1
+ module Wukong
2
+ module Elasticsearch
3
+
4
+ # A convenient class for parsing Elasticsearch index and mapping URIs
5
+ # like
6
+ #
7
+ # - es://my_index
8
+ # - es://my_index/my_mapping
9
+ # - es://first_index,second_index,third_index
10
+ # - es://my_index/first_mapping,second_mapping,third_mapping
11
+ class IndexAndMapping
12
+
13
+ # A regular expression that matches URIs describing an
14
+ # Elasticsearch index and/or mapping to read/write from/to.
15
+ #
16
+ # @param [Regexp]
17
+ ES_SCHEME_REGEXP = %r{^es://}
18
+
19
+ # The Elasticsearch index.
20
+ #
21
+ # @param [String]
22
+ attr_reader :index
23
+
24
+ # The Elasticsearch mapping.
25
+ #
26
+ # @param [String]
27
+ attr_reader :mapping
28
+
29
+ # Does the given +string+ look like a possible Elasticsearch
30
+ # /index/mapping specification?
31
+ #
32
+ # @param [String] string
33
+ # @return [true, false]
34
+ def self.matches? string
35
+ return false unless string
36
+ string =~ ES_SCHEME_REGEXP
37
+ end
38
+
39
+ # Create a new index and mapping specification from the given
40
+ # +uri+.
41
+ #
42
+ # @param [String] uri
43
+ def initialize uri
44
+ self.uri = uri
45
+ end
46
+
47
+ # Set the URI of this index and mapping specification, parsing it
48
+ # for an index and mapping.
49
+ #
50
+ # Will raise an error if the given URI is malformed.
51
+ #
52
+ # @param [String] uri
53
+ def uri= uri
54
+ raise Wukong::Error.new("'#{uri}' is not an ElasticSearch es://index/mapping specification") unless self.class.matches?(uri)
55
+ parts = uri.gsub(ES_SCHEME_REGEXP, '').gsub(/^\/+/,'').gsub(/\/+$/,'').split('/')
56
+
57
+ raise Wukong::Error.new("'#{uri}' is not an ElasticSearch es://index/mapping specification") unless parts.size.between?(1,2)
58
+
59
+ @index = parts[0]
60
+ @mapping = parts[1]
61
+ end
62
+ end
63
+ end
64
+ end
65
+
66
+
67
+