wonderdog 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (55) hide show
  1. data/.gitignore +49 -0
  2. data/.rspec +2 -0
  3. data/CHANGELOG.md +5 -0
  4. data/LICENSE.md +201 -0
  5. data/README.md +175 -0
  6. data/Rakefile +10 -0
  7. data/bin/estool +141 -0
  8. data/bin/estrus.rb +136 -0
  9. data/bin/wonderdog +93 -0
  10. data/config/elasticsearch-example.yml +227 -0
  11. data/config/elasticsearch.in.sh +52 -0
  12. data/config/logging.yml +43 -0
  13. data/config/more_settings.yml +60 -0
  14. data/config/run_elasticsearch-2.sh +42 -0
  15. data/config/ufo_config.json +12 -0
  16. data/lib/wonderdog.rb +14 -0
  17. data/lib/wonderdog/configuration.rb +25 -0
  18. data/lib/wonderdog/hadoop_invocation_override.rb +139 -0
  19. data/lib/wonderdog/index_and_mapping.rb +67 -0
  20. data/lib/wonderdog/timestamp.rb +43 -0
  21. data/lib/wonderdog/version.rb +3 -0
  22. data/notes/README-benchmarking.txt +272 -0
  23. data/notes/README-read_tuning.textile +74 -0
  24. data/notes/benchmarking-201011.numbers +0 -0
  25. data/notes/cluster_notes.md +17 -0
  26. data/notes/notes.txt +91 -0
  27. data/notes/pigstorefunc.pig +45 -0
  28. data/pom.xml +80 -0
  29. data/spec/spec_helper.rb +22 -0
  30. data/spec/support/driver_helper.rb +15 -0
  31. data/spec/support/integration_helper.rb +30 -0
  32. data/spec/wonderdog/hadoop_invocation_override_spec.rb +81 -0
  33. data/spec/wonderdog/index_and_type_spec.rb +73 -0
  34. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java +268 -0
  35. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputCommitter.java +39 -0
  36. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputFormat.java +283 -0
  37. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchSplit.java +60 -0
  38. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingInputFormat.java +231 -0
  39. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputCommitter.java +37 -0
  40. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java +88 -0
  41. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java +176 -0
  42. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java +171 -0
  43. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingSplit.java +102 -0
  44. data/src/main/java/com/infochimps/elasticsearch/ElasticTest.java +108 -0
  45. data/src/main/java/com/infochimps/elasticsearch/hadoop/util/HadoopUtils.java +100 -0
  46. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchIndex.java +216 -0
  47. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchJsonIndex.java +235 -0
  48. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchStorage.java +355 -0
  49. data/test/foo.json +3 -0
  50. data/test/foo.tsv +3 -0
  51. data/test/test_dump.pig +19 -0
  52. data/test/test_json_loader.pig +21 -0
  53. data/test/test_tsv_loader.pig +16 -0
  54. data/wonderdog.gemspec +32 -0
  55. metadata +130 -0
@@ -0,0 +1,3 @@
1
+ {"character":"a","value":"1"}
2
+ {"character":"b","value":"2"}
3
+ {"character":"c","value":"3"}
@@ -0,0 +1,3 @@
1
+ a 1
2
+ b 2
3
+ c 3
@@ -0,0 +1,19 @@
1
+ --
2
+ -- This tests loading data from elasticsearch
3
+ --
4
+
5
+ %default ES_JAR_DIR '/usr/local/Cellar/elasticsearch/0.18.7/libexec'
6
+ %default ES_YAML '/usr/local/Cellar/elasticsearch/0.18.7/config/elasticsearch.yml'
7
+ %default PLUGINS '/usr/local/Cellar/elasticsearch/0.18.7/plugins'
8
+
9
+ %default INDEX 'foo_test'
10
+ %default OBJ 'foo'
11
+
12
+ register $ES_JAR_DIR/*.jar;
13
+ register target/wonderdog*.jar;
14
+
15
+ --
16
+ -- Will load the data as (doc_id, contents) tuples where the contents is the original json source from elasticsearch
17
+ --
18
+ foo = LOAD 'es://$INDEX/$OBJ' USING com.infochimps.elasticsearch.pig.ElasticSearchStorage('$ES_YAML', '$PLUGINS') AS (doc_id:chararray, contents:chararray);
19
+ DUMP foo;
@@ -0,0 +1,21 @@
1
+ --
2
+ -- This tests the json indexer. Run in local mode with 'pig -x local test/test_json_loader.pig'
3
+ --
4
+
5
+ %default ES_JAR_DIR '/usr/local/Cellar/elasticsearch/0.18.7/libexec'
6
+ %default ES_YAML '/usr/local/Cellar/elasticsearch/0.18.7/config/elasticsearch.yml'
7
+ %default PLUGINS '/usr/local/Cellar/elasticsearch/0.18.7/plugins'
8
+
9
+ %default INDEX 'foo_test'
10
+ %default OBJ 'foo'
11
+
12
+ register $ES_JAR_DIR/*.jar;
13
+ register target/wonderdog*.jar;
14
+
15
+ foo = LOAD 'test/foo.json' AS (data:chararray);
16
+
17
+ --
18
+ -- Query parameters tell the elasticsearch output format that we're storing json data and
19
+ -- want to use a bulk request size of 1 record.
20
+ --
21
+ STORE foo INTO 'es://$INDEX/$OBJ?json=true&size=1' USING com.infochimps.elasticsearch.pig.ElasticSearchStorage('$ES_YAML', '$PLUGINS');
@@ -0,0 +1,16 @@
1
+ --
2
+ -- This tests the tsv indexer. Run in local mode with 'pig -x local test/test_tsv_loader.pig'
3
+ --
4
+ %default ES_JAR_DIR '/usr/local/Cellar/elasticsearch/0.18.7/libexec'
5
+ %default ES_YAML '/usr/local/Cellar/elasticsearch/0.18.7/config/elasticsearch.yml'
6
+ %default PLUGINS '/usr/local/Cellar/elasticsearch/0.18.7/plugins'
7
+
8
+ %default INDEX 'foo_test'
9
+ %default OBJ 'foo'
10
+
11
+ register $ES_JAR_DIR/*.jar;
12
+ register target/wonderdog*.jar;
13
+
14
+ foo = LOAD 'test/foo.tsv' AS (character:chararray, value:int);
15
+
16
+ STORE foo INTO 'es://$INDEX/$OBJ?json=false&size=1' USING com.infochimps.elasticsearch.pig.ElasticSearchStorage('$ES_YAML', '$PLUGINS');
@@ -0,0 +1,32 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path('../lib/wonderdog/version', __FILE__)
3
+
4
+ Gem::Specification.new do |gem|
5
+ gem.name = 'wonderdog'
6
+ gem.homepage = 'https://github.com/infochimps-labs/wonderdog'
7
+ gem.licenses = ["Apache 2.0"]
8
+ gem.email = 'coders@infochimps.com'
9
+ gem.authors = ['Infochimps', 'Philip (flip) Kromer', 'Jacob Perkins', 'Travis Dempsey', 'Dhruv Bansal']
10
+ gem.version = Wonderdog::VERSION
11
+
12
+ gem.summary = 'Make Hadoop and ElasticSearch play together nicely.'
13
+ gem.description = <<-EOF
14
+ Wonderdog provides code in both Ruby and Java to make Elasticsearch
15
+ a more fully-fledged member of both the Hadoop and Wukong
16
+ ecosystems.
17
+
18
+ For the Java side, Wonderdog provides InputFormat and OutputFormat
19
+ classes for use with Hadoop (esp. Hadoop Streaming) and Pig.
20
+
21
+ For the Ruby side, Wonderdog provides extensions for wu-hadoop to
22
+ make running Hadoop Streaming jobs written in Wukong against
23
+ ElasticSearch easier.
24
+ EOF
25
+
26
+ gem.files = `git ls-files`.split("\n")
27
+ gem.executables = []
28
+ gem.test_files = gem.files.grep(/^spec/)
29
+ gem.require_paths = ['lib']
30
+
31
+ gem.add_dependency('wukong', '3.0.0.pre2')
32
+ end
metadata ADDED
@@ -0,0 +1,130 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: wonderdog
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Infochimps
9
+ - Philip (flip) Kromer
10
+ - Jacob Perkins
11
+ - Travis Dempsey
12
+ - Dhruv Bansal
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+ date: 2012-12-01 00:00:00.000000000 Z
17
+ dependencies:
18
+ - !ruby/object:Gem::Dependency
19
+ name: wukong
20
+ requirement: !ruby/object:Gem::Requirement
21
+ none: false
22
+ requirements:
23
+ - - '='
24
+ - !ruby/object:Gem::Version
25
+ version: 3.0.0.pre2
26
+ type: :runtime
27
+ prerelease: false
28
+ version_requirements: !ruby/object:Gem::Requirement
29
+ none: false
30
+ requirements:
31
+ - - '='
32
+ - !ruby/object:Gem::Version
33
+ version: 3.0.0.pre2
34
+ description: ! " Wonderdog provides code in both Ruby and Java to make Elasticsearch\n
35
+ \ a more fully-fledged member of both the Hadoop and Wukong\n ecosystems.\n\n For
36
+ the Java side, Wonderdog provides InputFormat and OutputFormat\n classes for use
37
+ with Hadoop (esp. Hadoop Streaming) and Pig.\n\n For the Ruby side, Wonderdog provides
38
+ extensions for wu-hadoop to\n make running Hadoop Streaming jobs written in Wukong
39
+ against\n ElasticSearch easier.\n"
40
+ email: coders@infochimps.com
41
+ executables: []
42
+ extensions: []
43
+ extra_rdoc_files: []
44
+ files:
45
+ - .gitignore
46
+ - .rspec
47
+ - CHANGELOG.md
48
+ - LICENSE.md
49
+ - README.md
50
+ - Rakefile
51
+ - bin/estool
52
+ - bin/estrus.rb
53
+ - bin/wonderdog
54
+ - config/elasticsearch-example.yml
55
+ - config/elasticsearch.in.sh
56
+ - config/logging.yml
57
+ - config/more_settings.yml
58
+ - config/run_elasticsearch-2.sh
59
+ - config/ufo_config.json
60
+ - lib/wonderdog.rb
61
+ - lib/wonderdog/configuration.rb
62
+ - lib/wonderdog/hadoop_invocation_override.rb
63
+ - lib/wonderdog/index_and_mapping.rb
64
+ - lib/wonderdog/timestamp.rb
65
+ - lib/wonderdog/version.rb
66
+ - notes/README-benchmarking.txt
67
+ - notes/README-read_tuning.textile
68
+ - notes/benchmarking-201011.numbers
69
+ - notes/cluster_notes.md
70
+ - notes/notes.txt
71
+ - notes/pigstorefunc.pig
72
+ - pom.xml
73
+ - spec/spec_helper.rb
74
+ - spec/support/driver_helper.rb
75
+ - spec/support/integration_helper.rb
76
+ - spec/wonderdog/hadoop_invocation_override_spec.rb
77
+ - spec/wonderdog/index_and_type_spec.rb
78
+ - src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java
79
+ - src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputCommitter.java
80
+ - src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputFormat.java
81
+ - src/main/java/com/infochimps/elasticsearch/ElasticSearchSplit.java
82
+ - src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingInputFormat.java
83
+ - src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputCommitter.java
84
+ - src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java
85
+ - src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java
86
+ - src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java
87
+ - src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingSplit.java
88
+ - src/main/java/com/infochimps/elasticsearch/ElasticTest.java
89
+ - src/main/java/com/infochimps/elasticsearch/hadoop/util/HadoopUtils.java
90
+ - src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchIndex.java
91
+ - src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchJsonIndex.java
92
+ - src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchStorage.java
93
+ - test/foo.json
94
+ - test/foo.tsv
95
+ - test/test_dump.pig
96
+ - test/test_json_loader.pig
97
+ - test/test_tsv_loader.pig
98
+ - wonderdog.gemspec
99
+ homepage: https://github.com/infochimps-labs/wonderdog
100
+ licenses:
101
+ - Apache 2.0
102
+ post_install_message:
103
+ rdoc_options: []
104
+ require_paths:
105
+ - lib
106
+ required_ruby_version: !ruby/object:Gem::Requirement
107
+ none: false
108
+ requirements:
109
+ - - ! '>='
110
+ - !ruby/object:Gem::Version
111
+ version: '0'
112
+ required_rubygems_version: !ruby/object:Gem::Requirement
113
+ none: false
114
+ requirements:
115
+ - - ! '>='
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ requirements: []
119
+ rubyforge_project:
120
+ rubygems_version: 1.8.23
121
+ signing_key:
122
+ specification_version: 3
123
+ summary: Make Hadoop and ElasticSearch play together nicely.
124
+ test_files:
125
+ - spec/spec_helper.rb
126
+ - spec/support/driver_helper.rb
127
+ - spec/support/integration_helper.rb
128
+ - spec/wonderdog/hadoop_invocation_override_spec.rb
129
+ - spec/wonderdog/index_and_type_spec.rb
130
+ has_rdoc: