wonderdog 0.0.2 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore CHANGED
@@ -1,16 +1,11 @@
1
- \#*
2
- .\#*
3
- *~
4
- .DS_Store
5
- Icon?
6
- REVISION
7
- TAGS*
8
- nohup.out
9
- .bzr
10
- .hg
11
- .svn
12
1
 
13
- a.out
2
+
3
+
4
+
5
+
6
+
7
+
8
+ *.log
14
9
  *.o
15
10
  *.pyc
16
11
  *.so
@@ -18,32 +13,39 @@ a.out
18
13
  *.sw?
19
14
  *.tmproj
20
15
  *_flymake.*
16
+ *private*
17
+ *~
18
+ .DS_Store
19
+ .\#*
20
+ .bzr
21
+ .hg
21
22
  .project
22
23
  .pydevproject
23
24
  .settings
25
+ .svn
24
26
  .tasks-cache
25
27
  .yardoc
26
-
27
- *.log
28
-
29
- *private*
30
- /log/*
31
- /pkg/*
32
- /tmp/*
33
- /coverage
34
-
35
- /db/*.sqlite3
28
+ /Gemfile.lock
29
+ /config/apeyeye.yaml
36
30
  /config/database.yml
37
31
  /config/private.yml
32
+ /config/routes.rb
38
33
  /config/settings.yml
39
34
  /config/sphinx.yml
35
+ /coverage
36
+ /db/*.sqlite3
37
+ /log/*
38
+ /pkg/*
40
39
  /public/stylesheets/compiled/*
41
-
42
- /webrat.log
40
+ /target
41
+ /tmp/*
43
42
  /vendor/webrat/vendor
44
-
43
+ /webrat.log
44
+ Gemfile.lock
45
+ Icon?
46
+ REVISION
47
+ TAGS*
48
+ \#*
49
+ a.out
45
50
  doc
46
-
47
- /config/apeyeye.yaml
48
- /config/routes.rb
49
- /target
51
+ nohup.out
@@ -0,0 +1,6 @@
1
+ --readme README.md
2
+ --markup markdown
3
+ -
4
+ CHANGELOG.md
5
+ LICENSE.md
6
+ README.md
data/Gemfile CHANGED
@@ -5,4 +5,6 @@ gemspec
5
5
  group :development do
6
6
  gem 'rake', '~> 0.9'
7
7
  gem 'rspec', '~> 2'
8
+ gem 'yard'
9
+ gem 'redcarpet'
8
10
  end
@@ -0,0 +1,2 @@
1
+ Wukong.dataflow(:mapper) { identity }
2
+ Wukong.dataflow(:reducer) { identity }
@@ -0,0 +1,4 @@
1
+ require 'wonderdog'
2
+ Wukong.dataflow(:mapper) { identity }
3
+ Wukong.dataflow(:reducer) { identity }
4
+
@@ -6,9 +6,37 @@ module Wukong
6
6
  # Wukong. This module adds some overrides which enables the
7
7
  # <tt>wu-hadoop</tt> program to leverage this code.
8
8
  module Elasticsearch
9
+ include Plugin
10
+
11
+ # Configure the given `settings` to be able to work with
12
+ # Elasticsearch.
13
+ #
14
+ # @param [Configliere::Param] settings
15
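+ # @return [Object, nil] nil unless `program` is 'wu-hadoop'; otherwise the value of the final `settings.define` call (`settings` is no longer returned explicitly)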
16
+ def self.configure settings, program
17
+ return unless program == 'wu-hadoop'
18
+ settings.define(:es_tmp_dir, :description => "Temporary directory on the HDFS to store job files while reading/writing to ElasticSearch", :default => "/user/#{ENV['USER']}/wukong", :wukong_hadoop => true)
19
+ settings.define(:es_lib_dir, :description => "Directory containing Elasticsearch, Wonderdog, and other support jars", :default => "/usr/lib/hadoop/lib", :wukong_hadoop => true)
20
+ settings.define(:es_config, :description => "Where to find configuration files detailing how to join an ElasticSearch cluster", :wukong_hadoop => true)
21
+ settings.define(:es_input_splits, :description => "Number of input splits to target when reading from ElasticSearch", :type => Integer, :wukong_hadoop => true)
22
+ settings.define(:es_request_size, :description => "Number of objects requested during each batch read from ElasticSearch", :type => Integer, :wukong_hadoop => true)
23
+ settings.define(:es_scroll_timeout, :description => "Amount of time to wait on a scroll", :wukong_hadoop => true)
24
+ settings.define(:es_index_field, :description => "Field to use from each record to override the default index", :wukong_hadoop => true)
25
+ settings.define(:es_mapping_field, :description => "Field to use from each record to override the default mapping", :wukong_hadoop => true)
26
+ settings.define(:es_id_field, :description => "If this field is present in a record, make an update request, otherwise make a create request", :wukong_hadoop => true)
27
+ settings.define(:es_bulk_size, :description => "Number of requests to batch locally before making a request to ElasticSearch", :type => Integer, :wukong_hadoop => true)
28
+ settings.define(:es_query, :description => "Query to use when defining input splits for ElasticSearch input", :wukong_hadoop => true)
29
+ end
30
+
31
+ # Boot Wonderdog with the given `settings` in the given `root`.
32
+ #
33
+ # @param [Configliere::Param] settings
34
+ # @param [String] root
35
+ def self.boot settings, root
36
+ end
37
+
9
38
  end
10
39
  end
11
40
 
12
- require 'wonderdog/configuration'
13
41
  require 'wonderdog/hadoop_invocation_override'
14
42
  require 'wonderdog/timestamp'
@@ -99,8 +99,8 @@ module Wukong
99
99
  # @return [Array<String>]
100
100
  def hadoop_jobconf_options
101
101
  if reads_from_elasticsearch? || writes_to_elasticsearch?
102
- settings[:map_speculative] = false if settings[:map_speculative].nil?
103
- settings[:reduce_speculative] = false if settings[:reduce_speculative].nil?
102
+ settings[:map_speculative] = 'false' if settings[:map_speculative].nil?
103
+ settings[:reduce_speculative] = 'false' if settings[:reduce_speculative].nil?
104
104
  end
105
105
 
106
106
  super() + [].tap do |o|
@@ -164,5 +164,5 @@ module Wukong
164
164
  end
165
165
  end
166
166
 
167
- Hadoop::Driver.class_eval { include Elasticsearch::HadoopInvocationOverride }
167
+ Hadoop::HadoopRunner.class_eval { include Elasticsearch::HadoopInvocationOverride }
168
168
  end
@@ -26,7 +26,7 @@ module Wukong
26
26
  # @param [String]
27
27
  attr_reader :mapping
28
28
 
29
- # Does the given +string+ look like a possible Elasticsearch
29
+ # Does the given `string` look like a possible Elasticsearch
30
30
  # /index/mapping specification?
31
31
  #
32
32
  # @param [String] string
@@ -1,4 +1,4 @@
1
1
  module Wonderdog
2
2
  # The currently running Wonderdog version
3
- VERSION = '0.0.2'
3
+ VERSION = '0.1.0'
4
4
  end
@@ -1,12 +1,10 @@
1
1
  require 'wonderdog'
2
2
  require 'wukong/spec_helpers'
3
- require_relative('support/integration_helper')
4
- require_relative('support/driver_helper')
5
-
6
3
 
7
4
  RSpec.configure do |config|
8
5
 
9
6
  config.before(:each) do
7
+ Wukong::Log.level = Log4r::OFF
10
8
  @orig_reg = Wukong.registry.show
11
9
  end
12
10
 
@@ -14,9 +12,18 @@ RSpec.configure do |config|
14
12
  Wukong.registry.clear!
15
13
  Wukong.registry.merge!(@orig_reg)
16
14
  end
17
-
15
+
18
16
  include Wukong::SpecHelpers
19
- include Wukong::Elasticsearch::IntegrationHelper
20
- include Wukong::Elasticsearch::DriverHelper
21
- end
17
+
18
+ def root
19
+ @root ||= Pathname.new(File.expand_path('../..', __FILE__))
20
+ end
22
21
 
22
+ def hadoop_runner *args, &block
23
+ runner(Wukong::Hadoop::HadoopRunner, 'wu-hadoop', *args) do
24
+ stub!(:execute_command!)
25
+ instance_eval(&block) if block_given?
26
+ end
27
+ end
28
+
29
+ end
File without changes
@@ -2,10 +2,10 @@ require 'spec_helper'
2
2
 
3
3
  describe Wukong::Elasticsearch::HadoopInvocationOverride do
4
4
 
5
- let(:no_es) { driver('regexp', 'count', input: '/tmp/input_file', output: '/tmp/output_file') }
6
- let(:es_reader) { driver('regexp', 'count', input: 'es://the_index/the_map', output: '/tmp/output_file') }
7
- let(:es_writer) { driver('regexp', 'count', input: '/tmp/input_file', output: 'es:///the_index/the_map') }
8
- let(:es_complex) { driver('regexp', 'count', input: 'es://the_index/the_map', output: 'es:///the_index/the_map', es_query: '{"hi": "there"}', es_request_size: 1000, es_index_field: 'ID') }
5
+ let(:no_es) { hadoop_runner('regexp', 'count', input: '/tmp/input_file', output: '/tmp/output_file') }
6
+ let(:es_reader) { hadoop_runner('regexp', 'count', input: 'es://the_index/the_map', output: '/tmp/output_file') }
7
+ let(:es_writer) { hadoop_runner('regexp', 'count', input: '/tmp/input_file', output: 'es:///the_index/the_map') }
8
+ let(:es_complex) { hadoop_runner('regexp', 'count', input: 'es://the_index/the_map', output: 'es:///the_index/the_map', es_query: '{"hi": "there"}', es_request_size: 1000, es_index_field: 'ID', map_speculative: true, reduce_speculative: true) }
9
9
 
10
10
  context "passing necessary jars to Hadoop streaming" do
11
11
  before { Dir.stub!(:[]).and_return(["/lib/dir/elasticsearch.jar"], ["/lib/dir/wonderdog.jar"]) }
@@ -36,16 +36,32 @@ describe Wukong::Elasticsearch::HadoopInvocationOverride do
36
36
  context "setting speculative execution" do
37
37
  context "when not given speculative options" do
38
38
  context "and not interacting with Elasticsearch" do
39
- it "doesn't add jars" do
39
+ it "doesn't add any speculative options" do
40
40
  no_es.hadoop_commandline.should_not match('speculative')
41
41
  end
42
42
  end
43
43
  context "and reading from Elasticsearch" do
44
- it "adds default jars it finds on the local filesystem" do
45
- es_reader.hadoop_commandline.should match('-mapred.map.tasks.speculative.execution.*false')
46
- es_reader.hadoop_commandline.should match('-mapred.reduce.tasks.speculative.execution.*false')
44
+ it "disables speculative execution in the mapper" do
45
+ es_reader.hadoop_commandline.should match(/-D mapred.map.tasks.speculative.execution.*false/)
46
+ end
47
+ it "disables speculative execution in the reducer" do
48
+ es_reader.hadoop_commandline.should match(/-D mapred.reduce.tasks.speculative.execution.*false/)
47
49
  end
48
50
  end
51
+ context "and writing to Elasticsearch" do
52
+ it "disables speculative execution in the mapper" do
53
+ es_writer.hadoop_commandline.should match(/-D mapred.map.tasks.speculative.execution.*false/)
54
+ end
55
+ it "disables speculative execution in the reducer" do
56
+ es_writer.hadoop_commandline.should match(/-D mapred.reduce.tasks.speculative.execution.*false/)
57
+ end
58
+ end
59
+ end
60
+ context "when given speculative options" do
61
+ it "does not change them" do
62
+ es_complex.hadoop_commandline.should match(/-D mapred.map.tasks.speculative.execution.*true/)
63
+ es_complex.hadoop_commandline.should match(/-D mapred.reduce.tasks.speculative.execution.*true/)
64
+ end
49
65
  end
50
66
  end
51
67
 
@@ -0,0 +1,18 @@
1
+ require 'spec_helper'
2
+
3
+ describe 'wu-hadoop' do
4
+
5
+ context "when wonderdog hasn't been required" do
6
+ let(:script) { examples_dir('no_wonderdog.rb') }
7
+ it "doesn't recognize Elasticsearch URIs" do
8
+ command('wu-hadoop', script, '--input=es://foo/bar', '--output=/some/path', '--dry_run').should_not have_stdout('elasticsearch')
9
+ end
10
+ end
11
+
12
+ context "when wonderdog has been required" do
13
+ let(:script) { examples_dir('wonderdog.rb') }
14
+ it "recognizes Elasticsearch URIs" do
15
+ command('wu-hadoop', script, '--input=es://foo/bar', '--output=/some/path', '--dry_run').should have_stdout('elasticsearch')
16
+ end
17
+ end
18
+ end
@@ -0,0 +1,5 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wukong::Elasticsearch do
4
+ it_behaves_like 'a plugin'
5
+ end
@@ -28,6 +28,5 @@ EOF
28
28
  gem.test_files = gem.files.grep(/^spec/)
29
29
  gem.require_paths = ['lib']
30
30
 
31
- gem.add_dependency('wukong', '3.0.0.pre3')
32
- gem.add_dependency('wukong-hadoop', '>= 0.0.2')
31
+ gem.add_dependency('wukong-hadoop', '0.1.0')
33
32
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wonderdog
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.1.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,16 +13,16 @@ authors:
13
13
  autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
- date: 2012-12-17 00:00:00.000000000 Z
16
+ date: 2013-02-20 00:00:00.000000000 Z
17
17
  dependencies:
18
18
  - !ruby/object:Gem::Dependency
19
- name: wukong
19
+ name: wukong-hadoop
20
20
  requirement: !ruby/object:Gem::Requirement
21
21
  none: false
22
22
  requirements:
23
23
  - - '='
24
24
  - !ruby/object:Gem::Version
25
- version: 3.0.0.pre3
25
+ version: 0.1.0
26
26
  type: :runtime
27
27
  prerelease: false
28
28
  version_requirements: !ruby/object:Gem::Requirement
@@ -30,23 +30,7 @@ dependencies:
30
30
  requirements:
31
31
  - - '='
32
32
  - !ruby/object:Gem::Version
33
- version: 3.0.0.pre3
34
- - !ruby/object:Gem::Dependency
35
- name: wukong-hadoop
36
- requirement: !ruby/object:Gem::Requirement
37
- none: false
38
- requirements:
39
- - - ! '>='
40
- - !ruby/object:Gem::Version
41
- version: 0.0.2
42
- type: :runtime
43
- prerelease: false
44
- version_requirements: !ruby/object:Gem::Requirement
45
- none: false
46
- requirements:
47
- - - ! '>='
48
- - !ruby/object:Gem::Version
49
- version: 0.0.2
33
+ version: 0.1.0
50
34
  description: ! " Wonderdog provides code in both Ruby and Java to make Elasticsearch\n
51
35
  \ a more fully-fledged member of both the Hadoop and Wukong\n ecosystems.\n\n For
52
36
  the Java side, Wonderdog provides InputFormat and OutputFormat\n classes for use
@@ -60,9 +44,9 @@ extra_rdoc_files: []
60
44
  files:
61
45
  - .gitignore
62
46
  - .rspec
47
+ - .yardopts
63
48
  - CHANGELOG.md
64
49
  - Gemfile
65
- - Gemfile.lock
66
50
  - LICENSE.md
67
51
  - README.md
68
52
  - Rakefile
@@ -75,8 +59,9 @@ files:
75
59
  - config/more_settings.yml
76
60
  - config/run_elasticsearch-2.sh
77
61
  - config/ufo_config.json
62
+ - examples/no_wonderdog.rb
63
+ - examples/wonderdog.rb
78
64
  - lib/wonderdog.rb
79
- - lib/wonderdog/configuration.rb
80
65
  - lib/wonderdog/hadoop_invocation_override.rb
81
66
  - lib/wonderdog/index_and_mapping.rb
82
67
  - lib/wonderdog/timestamp.rb
@@ -89,10 +74,11 @@ files:
89
74
  - notes/pigstorefunc.pig
90
75
  - pom.xml
91
76
  - spec/spec_helper.rb
92
- - spec/support/driver_helper.rb
93
- - spec/support/integration_helper.rb
77
+ - spec/support/.gitkeep
94
78
  - spec/wonderdog/hadoop_invocation_override_spec.rb
95
79
  - spec/wonderdog/index_and_type_spec.rb
80
+ - spec/wonderdog/wu-hadoop_spec.rb
81
+ - spec/wonderdog_spec.rb
96
82
  - src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java
97
83
  - src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputCommitter.java
98
84
  - src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputFormat.java
@@ -141,8 +127,9 @@ specification_version: 3
141
127
  summary: Make Hadoop and ElasticSearch play together nicely.
142
128
  test_files:
143
129
  - spec/spec_helper.rb
144
- - spec/support/driver_helper.rb
145
- - spec/support/integration_helper.rb
130
+ - spec/support/.gitkeep
146
131
  - spec/wonderdog/hadoop_invocation_override_spec.rb
147
132
  - spec/wonderdog/index_and_type_spec.rb
133
+ - spec/wonderdog/wu-hadoop_spec.rb
134
+ - spec/wonderdog_spec.rb
148
135
  has_rdoc:
@@ -1,57 +0,0 @@
1
- PATH
2
- remote: .
3
- specs:
4
- wonderdog (0.0.1)
5
- wukong (= 3.0.0.pre3)
6
- wukong-hadoop (>= 0.0.2)
7
-
8
- GEM
9
- remote: http://rubygems.org/
10
- specs:
11
- configliere (0.4.18)
12
- highline (>= 1.5.2)
13
- multi_json (>= 1.1)
14
- diff-lcs (1.1.3)
15
- eventmachine (1.0.0)
16
- forgery (0.5.0)
17
- gorillib (0.4.2)
18
- configliere (>= 0.4.13)
19
- json
20
- multi_json (>= 1.1)
21
- highline (1.6.15)
22
- json (1.7.5)
23
- log4r (1.1.10)
24
- multi_json (1.5.0)
25
- rake (0.9.6)
26
- rspec (2.12.0)
27
- rspec-core (~> 2.12.0)
28
- rspec-expectations (~> 2.12.0)
29
- rspec-mocks (~> 2.12.0)
30
- rspec-core (2.12.2)
31
- rspec-expectations (2.12.1)
32
- diff-lcs (~> 1.1.3)
33
- rspec-mocks (2.12.0)
34
- uuidtools (2.1.3)
35
- vayacondios-client (0.1.2)
36
- configliere (>= 0.4.16)
37
- gorillib (~> 0.4.2)
38
- multi_json (~> 1.1)
39
- wukong (3.0.0.pre3)
40
- configliere (>= 0.4.18)
41
- eventmachine
42
- forgery
43
- gorillib (>= 0.4.2)
44
- log4r
45
- multi_json (>= 1.3.6)
46
- uuidtools
47
- vayacondios-client (>= 0.1.2)
48
- wukong-hadoop (0.0.2)
49
- wukong (= 3.0.0.pre3)
50
-
51
- PLATFORMS
52
- ruby
53
-
54
- DEPENDENCIES
55
- rake (~> 0.9)
56
- rspec (~> 2)
57
- wonderdog!
@@ -1,26 +0,0 @@
1
- module Wukong
2
- module Elasticsearch
3
-
4
- # Configure the given +settings+ to be able to work with
5
- # Elasticsearch.
6
- #
7
- # @param [Configliere::Param] settings
8
- # @return [Configliere::Param] the newly configured settings
9
- def self.configure settings
10
- settings.define(:es_tmp_dir, :description => "Temporary directory on the HDFS to store job files while reading/writing to ElasticSearch", :default => "/user/#{ENV['USER']}/wukong", :wukong_hadoop => true)
11
- settings.define(:es_lib_dir, :description => "Directory containing Elasticsearch, Wonderdog, and other support jars", :default => "/usr/lib/hadoop/lib", :wukong_hadoop => true)
12
- settings.define(:es_config, :description => "Where to find configuration files detailing how to join an ElasticSearch cluster", :wukong_hadoop => true)
13
- settings.define(:es_input_splits, :description => "Number of input splits to target when reading from ElasticSearch", :type => Integer, :wukong_hadoop => true)
14
- settings.define(:es_request_size, :description => "Number of objects requested during each batch read from ElasticSearch", :type => Integer, :wukong_hadoop => true)
15
- settings.define(:es_scroll_timeout, :description => "Amount of time to wait on a scroll", :wukong_hadoop => true)
16
- settings.define(:es_index_field, :description => "Field to use from each record to override the default index", :wukong_hadoop => true)
17
- settings.define(:es_mapping_field, :description => "Field to use from each record to override the default mapping", :wukong_hadoop => true)
18
- settings.define(:es_id_field, :description => "If this field is present in a record, make an update request, otherwise make a create request", :wukong_hadoop => true)
19
- settings.define(:es_bulk_size, :description => "Number of requests to batch locally before making a request to ElasticSearch", :type => Integer, :wukong_hadoop => true)
20
- settings.define(:es_query, :description => "Query to use when defining input splits for ElasticSearch input", :wukong_hadoop => true)
21
-
22
- settings
23
- end
24
- end
25
-
26
- end
@@ -1,15 +0,0 @@
1
- module Wukong
2
- module Elasticsearch
3
- module DriverHelper
4
-
5
- def driver *args
6
- params = Elasticsearch.configure(Hadoop.configure(Configliere::Param.new))
7
- params.resolve!
8
- params.merge!(args.pop) if args.last.is_a?(Hash)
9
- Hadoop::Driver.new(params, *args)
10
- end
11
-
12
- end
13
- end
14
- end
15
-
@@ -1,30 +0,0 @@
1
- module Wukong
2
- module Elasticsearch
3
- module IntegrationHelper
4
-
5
- def root
6
- @root ||= Pathname.new(File.expand_path('../../..', __FILE__))
7
- end
8
-
9
- def lib_dir
10
- root.join('lib')
11
- end
12
-
13
- def bin_dir
14
- root.join('bin')
15
- end
16
-
17
- def integration_env
18
- {
19
- "RUBYLIB" => [lib_dir.to_s, ENV["RUBYLIB"]].compact.join(':')
20
- }
21
- end
22
-
23
- def integration_cwd
24
- root.to_s
25
- end
26
-
27
- end
28
- end
29
- end
30
-