wonderdog 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. data/.gitignore +49 -0
  2. data/.rspec +2 -0
  3. data/CHANGELOG.md +5 -0
  4. data/LICENSE.md +201 -0
  5. data/README.md +175 -0
  6. data/Rakefile +10 -0
  7. data/bin/estool +141 -0
  8. data/bin/estrus.rb +136 -0
  9. data/bin/wonderdog +93 -0
  10. data/config/elasticsearch-example.yml +227 -0
  11. data/config/elasticsearch.in.sh +52 -0
  12. data/config/logging.yml +43 -0
  13. data/config/more_settings.yml +60 -0
  14. data/config/run_elasticsearch-2.sh +42 -0
  15. data/config/ufo_config.json +12 -0
  16. data/lib/wonderdog.rb +14 -0
  17. data/lib/wonderdog/configuration.rb +25 -0
  18. data/lib/wonderdog/hadoop_invocation_override.rb +139 -0
  19. data/lib/wonderdog/index_and_mapping.rb +67 -0
  20. data/lib/wonderdog/timestamp.rb +43 -0
  21. data/lib/wonderdog/version.rb +3 -0
  22. data/notes/README-benchmarking.txt +272 -0
  23. data/notes/README-read_tuning.textile +74 -0
  24. data/notes/benchmarking-201011.numbers +0 -0
  25. data/notes/cluster_notes.md +17 -0
  26. data/notes/notes.txt +91 -0
  27. data/notes/pigstorefunc.pig +45 -0
  28. data/pom.xml +80 -0
  29. data/spec/spec_helper.rb +22 -0
  30. data/spec/support/driver_helper.rb +15 -0
  31. data/spec/support/integration_helper.rb +30 -0
  32. data/spec/wonderdog/hadoop_invocation_override_spec.rb +81 -0
  33. data/spec/wonderdog/index_and_type_spec.rb +73 -0
  34. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java +268 -0
  35. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputCommitter.java +39 -0
  36. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputFormat.java +283 -0
  37. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchSplit.java +60 -0
  38. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingInputFormat.java +231 -0
  39. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputCommitter.java +37 -0
  40. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java +88 -0
  41. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java +176 -0
  42. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java +171 -0
  43. data/src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingSplit.java +102 -0
  44. data/src/main/java/com/infochimps/elasticsearch/ElasticTest.java +108 -0
  45. data/src/main/java/com/infochimps/elasticsearch/hadoop/util/HadoopUtils.java +100 -0
  46. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchIndex.java +216 -0
  47. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchJsonIndex.java +235 -0
  48. data/src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchStorage.java +355 -0
  49. data/test/foo.json +3 -0
  50. data/test/foo.tsv +3 -0
  51. data/test/test_dump.pig +19 -0
  52. data/test/test_json_loader.pig +21 -0
  53. data/test/test_tsv_loader.pig +16 -0
  54. data/wonderdog.gemspec +32 -0
  55. metadata +130 -0
@@ -0,0 +1,3 @@
1
+ {"character":"a","value":"1"}
2
+ {"character":"b","value":"2"}
3
+ {"character":"c","value":"3"}
@@ -0,0 +1,3 @@
1
+ a 1
2
+ b 2
3
+ c 3
@@ -0,0 +1,19 @@
1
+ --
2
+ -- This tests loading data from elasticsearch
3
+ --
4
+
5
+ %default ES_JAR_DIR '/usr/local/Cellar/elasticsearch/0.18.7/libexec'
6
+ %default ES_YAML '/usr/local/Cellar/elasticsearch/0.18.7/config/elasticsearch.yml'
7
+ %default PLUGINS '/usr/local/Cellar/elasticsearch/0.18.7/plugins'
8
+
9
+ %default INDEX 'foo_test'
10
+ %default OBJ 'foo'
11
+
12
+ register $ES_JAR_DIR/*.jar;
13
+ register target/wonderdog*.jar;
14
+
15
+ --
16
+ -- Will load the data as (doc_id, contents) tuples where the contents is the original json source from elasticsearch
17
+ --
18
+ foo = LOAD 'es://$INDEX/$OBJ' USING com.infochimps.elasticsearch.pig.ElasticSearchStorage('$ES_YAML', '$PLUGINS') AS (doc_id:chararray, contents:chararray);
19
+ DUMP foo;
@@ -0,0 +1,21 @@
1
+ --
2
+ -- This tests the json indexer. Run in local mode with 'pig -x local test/test_json_loader.pig'
3
+ --
4
+
5
+ %default ES_JAR_DIR '/usr/local/Cellar/elasticsearch/0.18.7/libexec'
6
+ %default ES_YAML '/usr/local/Cellar/elasticsearch/0.18.7/config/elasticsearch.yml'
7
+ %default PLUGINS '/usr/local/Cellar/elasticsearch/0.18.7/plugins'
8
+
9
+ %default INDEX 'foo_test'
10
+ %default OBJ 'foo'
11
+
12
+ register $ES_JAR_DIR/*.jar;
13
+ register target/wonderdog*.jar;
14
+
15
+ foo = LOAD 'test/foo.json' AS (data:chararray);
16
+
17
+ --
18
+ -- Query parameters tell the elasticsearch output format that we're storing json data and
19
+ -- want to use a bulk request size of 1 record.
20
+ --
21
+ STORE foo INTO 'es://$INDEX/$OBJ?json=true&size=1' USING com.infochimps.elasticsearch.pig.ElasticSearchStorage('$ES_YAML', '$PLUGINS');
@@ -0,0 +1,16 @@
1
+ --
2
+ -- This tests the tsv indexer. Run in local mode with 'pig -x local test/test_tsv_loader.pig'
3
+ --
4
+ %default ES_JAR_DIR '/usr/local/Cellar/elasticsearch/0.18.7/libexec'
5
+ %default ES_YAML '/usr/local/Cellar/elasticsearch/0.18.7/config/elasticsearch.yml'
6
+ %default PLUGINS '/usr/local/Cellar/elasticsearch/0.18.7/plugins'
7
+
8
+ %default INDEX 'foo_test'
9
+ %default OBJ 'foo'
10
+
11
+ register $ES_JAR_DIR/*.jar;
12
+ register target/wonderdog*.jar;
13
+
14
+ foo = LOAD 'test/foo.tsv' AS (character:chararray, value:int);
15
+
16
+ STORE foo INTO 'es://$INDEX/$OBJ?json=false&size=1' USING com.infochimps.elasticsearch.pig.ElasticSearchStorage('$ES_YAML', '$PLUGINS');
@@ -0,0 +1,32 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require File.expand_path('../lib/wonderdog/version', __FILE__)
3
+
4
+ Gem::Specification.new do |gem|
5
+ gem.name = 'wonderdog'
6
+ gem.homepage = 'https://github.com/infochimps-labs/wonderdog'
7
+ gem.licenses = ["Apache 2.0"]
8
+ gem.email = 'coders@infochimps.com'
9
+ gem.authors = ['Infochimps', 'Philip (flip) Kromer', 'Jacob Perkins', 'Travis Dempsey', 'Dhruv Bansal']
10
+ gem.version = Wonderdog::VERSION
11
+
12
+ gem.summary = 'Make Hadoop and ElasticSearch play together nicely.'
13
+ gem.description = <<-EOF
14
+ Wonderdog provides code in both Ruby and Java to make Elasticsearch
15
+ a more fully-fledged member of both the Hadoop and Wukong
16
+ ecosystems.
17
+
18
+ For the Java side, Wonderdog provides InputFormat and OutputFormat
19
+ classes for use with Hadoop (esp. Hadoop Streaming) and Pig.
20
+
21
+ For the Ruby side, Wonderdog provides extensions for wu-hadoop to
22
+ make running Hadoop Streaming jobs written in Wukong against
23
+ ElasticSearch easier.
24
+ EOF
25
+
26
+ gem.files = `git ls-files`.split("\n")
27
+ gem.executables = []
28
+ gem.test_files = gem.files.grep(/^spec/)
29
+ gem.require_paths = ['lib']
30
+
31
+ gem.add_dependency('wukong', '3.0.0.pre2')
32
+ end
metadata ADDED
@@ -0,0 +1,130 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: wonderdog
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Infochimps
9
+ - Philip (flip) Kromer
10
+ - Jacob Perkins
11
+ - Travis Dempsey
12
+ - Dhruv Bansal
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+ date: 2012-12-01 00:00:00.000000000 Z
17
+ dependencies:
18
+ - !ruby/object:Gem::Dependency
19
+ name: wukong
20
+ requirement: !ruby/object:Gem::Requirement
21
+ none: false
22
+ requirements:
23
+ - - '='
24
+ - !ruby/object:Gem::Version
25
+ version: 3.0.0.pre2
26
+ type: :runtime
27
+ prerelease: false
28
+ version_requirements: !ruby/object:Gem::Requirement
29
+ none: false
30
+ requirements:
31
+ - - '='
32
+ - !ruby/object:Gem::Version
33
+ version: 3.0.0.pre2
34
+ description: ! " Wonderdog provides code in both Ruby and Java to make Elasticsearch\n
35
+ \ a more fully-fledged member of both the Hadoop and Wukong\n ecosystems.\n\n For
36
+ the Java side, Wonderdog provides InputFormat and OutputFormat\n classes for use
37
+ with Hadoop (esp. Hadoop Streaming) and Pig.\n\n For the Ruby side, Wonderdog provides
38
+ extensions for wu-hadoop to\n make running Hadoop Streaming jobs written in Wukong
39
+ against\n ElasticSearch easier.\n"
40
+ email: coders@infochimps.com
41
+ executables: []
42
+ extensions: []
43
+ extra_rdoc_files: []
44
+ files:
45
+ - .gitignore
46
+ - .rspec
47
+ - CHANGELOG.md
48
+ - LICENSE.md
49
+ - README.md
50
+ - Rakefile
51
+ - bin/estool
52
+ - bin/estrus.rb
53
+ - bin/wonderdog
54
+ - config/elasticsearch-example.yml
55
+ - config/elasticsearch.in.sh
56
+ - config/logging.yml
57
+ - config/more_settings.yml
58
+ - config/run_elasticsearch-2.sh
59
+ - config/ufo_config.json
60
+ - lib/wonderdog.rb
61
+ - lib/wonderdog/configuration.rb
62
+ - lib/wonderdog/hadoop_invocation_override.rb
63
+ - lib/wonderdog/index_and_mapping.rb
64
+ - lib/wonderdog/timestamp.rb
65
+ - lib/wonderdog/version.rb
66
+ - notes/README-benchmarking.txt
67
+ - notes/README-read_tuning.textile
68
+ - notes/benchmarking-201011.numbers
69
+ - notes/cluster_notes.md
70
+ - notes/notes.txt
71
+ - notes/pigstorefunc.pig
72
+ - pom.xml
73
+ - spec/spec_helper.rb
74
+ - spec/support/driver_helper.rb
75
+ - spec/support/integration_helper.rb
76
+ - spec/wonderdog/hadoop_invocation_override_spec.rb
77
+ - spec/wonderdog/index_and_type_spec.rb
78
+ - src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java
79
+ - src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputCommitter.java
80
+ - src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputFormat.java
81
+ - src/main/java/com/infochimps/elasticsearch/ElasticSearchSplit.java
82
+ - src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingInputFormat.java
83
+ - src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputCommitter.java
84
+ - src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingOutputFormat.java
85
+ - src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordReader.java
86
+ - src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingRecordWriter.java
87
+ - src/main/java/com/infochimps/elasticsearch/ElasticSearchStreamingSplit.java
88
+ - src/main/java/com/infochimps/elasticsearch/ElasticTest.java
89
+ - src/main/java/com/infochimps/elasticsearch/hadoop/util/HadoopUtils.java
90
+ - src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchIndex.java
91
+ - src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchJsonIndex.java
92
+ - src/main/java/com/infochimps/elasticsearch/pig/ElasticSearchStorage.java
93
+ - test/foo.json
94
+ - test/foo.tsv
95
+ - test/test_dump.pig
96
+ - test/test_json_loader.pig
97
+ - test/test_tsv_loader.pig
98
+ - wonderdog.gemspec
99
+ homepage: https://github.com/infochimps-labs/wonderdog
100
+ licenses:
101
+ - Apache 2.0
102
+ post_install_message:
103
+ rdoc_options: []
104
+ require_paths:
105
+ - lib
106
+ required_ruby_version: !ruby/object:Gem::Requirement
107
+ none: false
108
+ requirements:
109
+ - - ! '>='
110
+ - !ruby/object:Gem::Version
111
+ version: '0'
112
+ required_rubygems_version: !ruby/object:Gem::Requirement
113
+ none: false
114
+ requirements:
115
+ - - ! '>='
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ requirements: []
119
+ rubyforge_project:
120
+ rubygems_version: 1.8.23
121
+ signing_key:
122
+ specification_version: 3
123
+ summary: Make Hadoop and ElasticSearch play together nicely.
124
+ test_files:
125
+ - spec/spec_helper.rb
126
+ - spec/support/driver_helper.rb
127
+ - spec/support/integration_helper.rb
128
+ - spec/wonderdog/hadoop_invocation_override_spec.rb
129
+ - spec/wonderdog/index_and_type_spec.rb
130
+ has_rdoc: