wonderdog 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +49 -0
- data/.rspec +2 -0
- data/CHANGELOG.md +5 -0
- data/LICENSE.md +201 -0
- data/README.md +175 -0
- data/Rakefile +10 -0
- data/bin/estool +141 -0
- data/bin/estrus.rb +136 -0
- data/bin/wonderdog +93 -0
- data/config/elasticsearch-example.yml +227 -0
- data/config/elasticsearch.in.sh +52 -0
- data/config/logging.yml +43 -0
- data/config/more_settings.yml +60 -0
- data/config/run_elasticsearch-2.sh +42 -0
- data/config/ufo_config.json +12 -0
- data/lib/wonderdog.rb +14 -0
- data/lib/wonderdog/configuration.rb +25 -0
- data/lib/wonderdog/hadoop_invocation_override.rb +139 -0
- data/lib/wonderdog/index_and_mapping.rb +67 -0
- data/lib/wonderdog/timestamp.rb +43 -0
- data/lib/wonderdog/version.rb +3 -0
- data/notes/README-benchmarking.txt +272 -0
- data/notes/README-read_tuning.textile +74 -0
- data/notes/benchmarking-201011.numbers +0 -0
- data/notes/cluster_notes.md +17 -0
- data/notes/notes.txt +91 -0
- data/notes/pigstorefunc.pig +45 -0
- data/pom.xml +80 -0
- data/spec/spec_helper.rb +22 -0
- data/spec/support/driver_helper.rb +15 -0
- data/spec/support/integration_helper.rb +30 -0
- data/spec/wonderdog/hadoop_invocation_override_spec.rb +81 -0
- data/spec/wonderdog/index_and_type_spec.rb +73 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java +268 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputCommitter.java +39 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputFormat.java +283 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchSplit.java +60 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingInputFormat.java +231 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputCommitter.java +37 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java +88 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java +176 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java +171 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingSplit.java +102 -0
- data/src/main/java/com/infochimps/elasticsearch/ElasticTest.java +108 -0
- data/src/main/java/com/infochimps/elasticsearch/hadoop/util/HadoopUtils.java +100 -0
- data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchIndex.java +216 -0
- data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchJsonIndex.java +235 -0
- data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchStorage.java +355 -0
- data/test/foo.json +3 -0
- data/test/foo.tsv +3 -0
- data/test/test_dump.pig +19 -0
- data/test/test_json_loader.pig +21 -0
- data/test/test_tsv_loader.pig +16 -0
- data/wonderdog.gemspec +32 -0
- metadata +130 -0
data/test/foo.json
ADDED
data/test/foo.tsv
ADDED
data/test/test_dump.pig
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
--
|
2
|
+
-- This tests loading data from elasticsearch
|
3
|
+
--
|
4
|
+
|
5
|
+
%default ES_JAR_DIR '/usr/local/Cellar/elasticsearch/0.18.7/libexec'
|
6
|
+
%default ES_YAML '/usr/local/Cellar/elasticsearch/0.18.7/config/elasticsearch.yml'
|
7
|
+
%default PLUGINS '/usr/local/Cellar/elasticsearch/0.18.7/plugins'
|
8
|
+
|
9
|
+
%default INDEX 'foo_test'
|
10
|
+
%default OBJ 'foo'
|
11
|
+
|
12
|
+
register $ES_JAR_DIR/*.jar;
|
13
|
+
register target/wonderdog*.jar;
|
14
|
+
|
15
|
+
--
|
16
|
+
-- Will load the data as (doc_id, contents) tuples where the contents is the original json source from elasticsearch
|
17
|
+
--
|
18
|
+
foo = LOAD 'es://$INDEX/$OBJ' USING com.infochimps.elasticsearch.pig.ElasticSearchStorage('$ES_YAML', '$PLUGINS') AS (doc_id:chararray, contents:chararray);
|
19
|
+
DUMP foo;
|
@@ -0,0 +1,21 @@
|
|
1
|
+
--
|
2
|
+
-- This tests the json indexer. Run in local mode with 'pig -x local test/test_json_loader.pig'
|
3
|
+
--
|
4
|
+
|
5
|
+
%default ES_JAR_DIR '/usr/local/Cellar/elasticsearch/0.18.7/libexec'
|
6
|
+
%default ES_YAML '/usr/local/Cellar/elasticsearch/0.18.7/config/elasticsearch.yml'
|
7
|
+
%default PLUGINS '/usr/local/Cellar/elasticsearch/0.18.7/plugins'
|
8
|
+
|
9
|
+
%default INDEX 'foo_test'
|
10
|
+
%default OBJ 'foo'
|
11
|
+
|
12
|
+
register $ES_JAR_DIR/*.jar;
|
13
|
+
register target/wonderdog*.jar;
|
14
|
+
|
15
|
+
foo = LOAD 'test/foo.json' AS (data:chararray);
|
16
|
+
|
17
|
+
--
|
18
|
+
-- Query parameters tell the elasticsearch output format that we're storing json data and
|
19
|
+
-- want to use a bulk request size of 1 record.
|
20
|
+
--
|
21
|
+
STORE foo INTO 'es://$INDEX/$OBJ?json=true&size=1' USING com.infochimps.elasticsearch.pig.ElasticSearchStorage('$ES_YAML', '$PLUGINS');
|
@@ -0,0 +1,16 @@
|
|
1
|
+
--
|
2
|
+
-- This tests the tsv indexer. Run in local mode with 'pig -x local test/test_tsv_loader.pig'
|
3
|
+
--
|
4
|
+
%default ES_JAR_DIR '/usr/local/Cellar/elasticsearch/0.18.7/libexec'
|
5
|
+
%default ES_YAML '/usr/local/Cellar/elasticsearch/0.18.7/config/elasticsearch.yml'
|
6
|
+
%default PLUGINS '/usr/local/Cellar/elasticsearch/0.18.7/plugins'
|
7
|
+
|
8
|
+
%default INDEX 'foo_test'
|
9
|
+
%default OBJ 'foo'
|
10
|
+
|
11
|
+
register $ES_JAR_DIR/*.jar;
|
12
|
+
register target/wonderdog*.jar;
|
13
|
+
|
14
|
+
foo = LOAD 'test/foo.tsv' AS (character:chararray, value:int);
|
15
|
+
|
16
|
+
STORE foo INTO 'es://$INDEX/$OBJ?json=false&size=1' USING com.infochimps.elasticsearch.pig.ElasticSearchStorage('$ES_YAML', '$PLUGINS');
|
data/wonderdog.gemspec
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
require File.expand_path('../lib/wonderdog/version', __FILE__)
|
3
|
+
|
4
|
+
Gem::Specification.new do |gem|
|
5
|
+
gem.name = 'wonderdog'
|
6
|
+
gem.homepage = 'https://github.com/infochimps-labs/wonderdog'
|
7
|
+
gem.licenses = ["Apache 2.0"]
|
8
|
+
gem.email = 'coders@infochimps.com'
|
9
|
+
gem.authors = ['Infochimps', 'Philip (flip) Kromer', 'Jacob Perkins', 'Travis Dempsey', 'Dhruv Bansal']
|
10
|
+
gem.version = Wonderdog::VERSION
|
11
|
+
|
12
|
+
gem.summary = 'Make Hadoop and ElasticSearch play together nicely.'
|
13
|
+
gem.description = <<-EOF
|
14
|
+
Wonderdog provides code in both Ruby and Java to make Elasticsearch
|
15
|
+
a more fully-fledged member of both the Hadoop and Wukong
|
16
|
+
ecosystems.
|
17
|
+
|
18
|
+
For the Java side, Wonderdog provides InputFormat and OutputFormat
|
19
|
+
classes for use with Hadoop (esp. Hadoop Streaming) and Pig.
|
20
|
+
|
21
|
+
For the Ruby side, Wonderdog provides extensions for wu-hadoop to
|
22
|
+
make running Hadoop Streaming jobs written in Wukong against
|
23
|
+
ElasticSearch easier.
|
24
|
+
EOF
|
25
|
+
|
26
|
+
gem.files = `git ls-files`.split("\n")
|
27
|
+
gem.executables = []
|
28
|
+
gem.test_files = gem.files.grep(/^spec/)
|
29
|
+
gem.require_paths = ['lib']
|
30
|
+
|
31
|
+
gem.add_dependency('wukong', '3.0.0.pre2')
|
32
|
+
end
|
metadata
ADDED
@@ -0,0 +1,130 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: wonderdog
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Infochimps
|
9
|
+
- Philip (flip) Kromer
|
10
|
+
- Jacob Perkins
|
11
|
+
- Travis Dempsey
|
12
|
+
- Dhruv Bansal
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
date: 2012-12-01 00:00:00.000000000 Z
|
17
|
+
dependencies:
|
18
|
+
- !ruby/object:Gem::Dependency
|
19
|
+
name: wukong
|
20
|
+
requirement: !ruby/object:Gem::Requirement
|
21
|
+
none: false
|
22
|
+
requirements:
|
23
|
+
- - '='
|
24
|
+
- !ruby/object:Gem::Version
|
25
|
+
version: 3.0.0.pre2
|
26
|
+
type: :runtime
|
27
|
+
prerelease: false
|
28
|
+
version_requirements: !ruby/object:Gem::Requirement
|
29
|
+
none: false
|
30
|
+
requirements:
|
31
|
+
- - '='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 3.0.0.pre2
|
34
|
+
description: ! " Wonderdog provides code in both Ruby and Java to make Elasticsearch\n
|
35
|
+
\ a more fully-fledged member of both the Hadoop and Wukong\n ecosystems.\n\n For
|
36
|
+
the Java side, Wonderdog provides InputFormat and OutputFormat\n classes for use
|
37
|
+
with Hadoop (esp. Hadoop Streaming) and Pig.\n\n For the Ruby side, Wonderdog provides
|
38
|
+
extensions for wu-hadoop to\n make running Hadoop Streaming jobs written in Wukong
|
39
|
+
against\n ElasticSearch easier.\n"
|
40
|
+
email: coders@infochimps.com
|
41
|
+
executables: []
|
42
|
+
extensions: []
|
43
|
+
extra_rdoc_files: []
|
44
|
+
files:
|
45
|
+
- .gitignore
|
46
|
+
- .rspec
|
47
|
+
- CHANGELOG.md
|
48
|
+
- LICENSE.md
|
49
|
+
- README.md
|
50
|
+
- Rakefile
|
51
|
+
- bin/estool
|
52
|
+
- bin/estrus.rb
|
53
|
+
- bin/wonderdog
|
54
|
+
- config/elasticsearch-example.yml
|
55
|
+
- config/elasticsearch.in.sh
|
56
|
+
- config/logging.yml
|
57
|
+
- config/more_settings.yml
|
58
|
+
- config/run_elasticsearch-2.sh
|
59
|
+
- config/ufo_config.json
|
60
|
+
- lib/wonderdog.rb
|
61
|
+
- lib/wonderdog/configuration.rb
|
62
|
+
- lib/wonderdog/hadoop_invocation_override.rb
|
63
|
+
- lib/wonderdog/index_and_mapping.rb
|
64
|
+
- lib/wonderdog/timestamp.rb
|
65
|
+
- lib/wonderdog/version.rb
|
66
|
+
- notes/README-benchmarking.txt
|
67
|
+
- notes/README-read_tuning.textile
|
68
|
+
- notes/benchmarking-201011.numbers
|
69
|
+
- notes/cluster_notes.md
|
70
|
+
- notes/notes.txt
|
71
|
+
- notes/pigstorefunc.pig
|
72
|
+
- pom.xml
|
73
|
+
- spec/spec_helper.rb
|
74
|
+
- spec/support/driver_helper.rb
|
75
|
+
- spec/support/integration_helper.rb
|
76
|
+
- spec/wonderdog/hadoop_invocation_override_spec.rb
|
77
|
+
- spec/wonderdog/index_and_type_spec.rb
|
78
|
+
- src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java
|
79
|
+
- src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputCommitter.java
|
80
|
+
- src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputFormat.java
|
81
|
+
- src/main/java/com/infochimps/elasticsearch/ElasticSearchSplit.java
|
82
|
+
- src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingInputFormat.java
|
83
|
+
- src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputCommitter.java
|
84
|
+
- src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java
|
85
|
+
- src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java
|
86
|
+
- src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java
|
87
|
+
- src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingSplit.java
|
88
|
+
- src/main/java/com/infochimps/elasticsearch/ElasticTest.java
|
89
|
+
- src/main/java/com/infochimps/elasticsearch/hadoop/util/HadoopUtils.java
|
90
|
+
- src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchIndex.java
|
91
|
+
- src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchJsonIndex.java
|
92
|
+
- src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchStorage.java
|
93
|
+
- test/foo.json
|
94
|
+
- test/foo.tsv
|
95
|
+
- test/test_dump.pig
|
96
|
+
- test/test_json_loader.pig
|
97
|
+
- test/test_tsv_loader.pig
|
98
|
+
- wonderdog.gemspec
|
99
|
+
homepage: https://github.com/infochimps-labs/wonderdog
|
100
|
+
licenses:
|
101
|
+
- Apache 2.0
|
102
|
+
post_install_message:
|
103
|
+
rdoc_options: []
|
104
|
+
require_paths:
|
105
|
+
- lib
|
106
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
107
|
+
none: false
|
108
|
+
requirements:
|
109
|
+
- - ! '>='
|
110
|
+
- !ruby/object:Gem::Version
|
111
|
+
version: '0'
|
112
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
113
|
+
none: false
|
114
|
+
requirements:
|
115
|
+
- - ! '>='
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
requirements: []
|
119
|
+
rubyforge_project:
|
120
|
+
rubygems_version: 1.8.23
|
121
|
+
signing_key:
|
122
|
+
specification_version: 3
|
123
|
+
summary: Make Hadoop and ElasticSearch play together nicely.
|
124
|
+
test_files:
|
125
|
+
- spec/spec_helper.rb
|
126
|
+
- spec/support/driver_helper.rb
|
127
|
+
- spec/support/integration_helper.rb
|
128
|
+
- spec/wonderdog/hadoop_invocation_override_spec.rb
|
129
|
+
- spec/wonderdog/index_and_type_spec.rb
|
130
|
+
has_rdoc:
|