wonderdog 0.0.2 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore CHANGED
@@ -1,16 +1,11 @@
1
- \#*
2
- .\#*
3
- *~
4
- .DS_Store
5
- Icon?
6
- REVISION
7
- TAGS*
8
- nohup.out
9
- .bzr
10
- .hg
11
- .svn
12
1
 
13
- a.out
2
+
3
+
4
+
5
+
6
+
7
+
8
+ *.log
14
9
  *.o
15
10
  *.pyc
16
11
  *.so
@@ -18,32 +13,39 @@ a.out
18
13
  *.sw?
19
14
  *.tmproj
20
15
  *_flymake.*
16
+ *private*
17
+ *~
18
+ .DS_Store
19
+ .\#*
20
+ .bzr
21
+ .hg
21
22
  .project
22
23
  .pydevproject
23
24
  .settings
25
+ .svn
24
26
  .tasks-cache
25
27
  .yardoc
26
-
27
- *.log
28
-
29
- *private*
30
- /log/*
31
- /pkg/*
32
- /tmp/*
33
- /coverage
34
-
35
- /db/*.sqlite3
28
+ /Gemfile.lock
29
+ /config/apeyeye.yaml
36
30
  /config/database.yml
37
31
  /config/private.yml
32
+ /config/routes.rb
38
33
  /config/settings.yml
39
34
  /config/sphinx.yml
35
+ /coverage
36
+ /db/*.sqlite3
37
+ /log/*
38
+ /pkg/*
40
39
  /public/stylesheets/compiled/*
41
-
42
- /webrat.log
40
+ /target
41
+ /tmp/*
43
42
  /vendor/webrat/vendor
44
-
43
+ /webrat.log
44
+ Gemfile.lock
45
+ Icon?
46
+ REVISION
47
+ TAGS*
48
+ \#*
49
+ a.out
45
50
  doc
46
-
47
- /config/apeyeye.yaml
48
- /config/routes.rb
49
- /target
51
+ nohup.out
@@ -0,0 +1,6 @@
1
+ --readme README.md
2
+ --markup markdown
3
+ -
4
+ CHANGELOG.md
5
+ LICENSE.md
6
+ README.md
data/Gemfile CHANGED
@@ -5,4 +5,6 @@ gemspec
5
5
  group :development do
6
6
  gem 'rake', '~> 0.9'
7
7
  gem 'rspec', '~> 2'
8
+ gem 'yard'
9
+ gem 'redcarpet'
8
10
  end
@@ -0,0 +1,2 @@
1
+ Wukong.dataflow(:mapper) { identity }
2
+ Wukong.dataflow(:reducer) { identity }
@@ -0,0 +1,4 @@
1
+ require 'wonderdog'
2
+ Wukong.dataflow(:mapper) { identity }
3
+ Wukong.dataflow(:reducer) { identity }
4
+
@@ -6,9 +6,37 @@ module Wukong
6
6
  # Wukong. This module adds some overrides which enables the
7
7
  # <tt>wu-hadoop</tt> program to leverage this code.
8
8
  module Elasticsearch
9
+ include Plugin
10
+
11
+ # Configure the given `settings` to be able to work with
12
+ # Elasticsearch.
13
+ #
14
+ # @param [Configliere::Param] settings
15
+ # @return [Configliere::Param] the newly configured settings
16
+ def self.configure settings, program
17
+ return unless program == 'wu-hadoop'
18
+ settings.define(:es_tmp_dir, :description => "Temporary directory on the HDFS to store job files while reading/writing to ElasticSearch", :default => "/user/#{ENV['USER']}/wukong", :wukong_hadoop => true)
19
+ settings.define(:es_lib_dir, :description => "Directory containing Elasticsearch, Wonderdog, and other support jars", :default => "/usr/lib/hadoop/lib", :wukong_hadoop => true)
20
+ settings.define(:es_config, :description => "Where to find configuration files detailing how to join an ElasticSearch cluster", :wukong_hadoop => true)
21
+ settings.define(:es_input_splits, :description => "Number of input splits to target when reading from ElasticSearch", :type => Integer, :wukong_hadoop => true)
22
+ settings.define(:es_request_size, :description => "Number of objects requested during each batch read from ElasticSearch", :type => Integer, :wukong_hadoop => true)
23
+ settings.define(:es_scroll_timeout, :description => "Amount of time to wait on a scroll", :wukong_hadoop => true)
24
+ settings.define(:es_index_field, :description => "Field to use from each record to override the default index", :wukong_hadoop => true)
25
+ settings.define(:es_mapping_field, :description => "Field to use from each record to override the default mapping", :wukong_hadoop => true)
26
+ settings.define(:es_id_field, :description => "If this field is present in a record, make an update request, otherwise make a create request", :wukong_hadoop => true)
27
+ settings.define(:es_bulk_size, :description => "Number of requests to batch locally before making a request to ElasticSearch", :type => Integer, :wukong_hadoop => true)
28
+ settings.define(:es_query, :description => "Query to use when defining input splits for ElasticSearch input", :wukong_hadoop => true)
29
+ end
30
+
31
+ # Boot Wonderdog with the given `settings` in the given `root`.
32
+ #
33
+ # @param [Configliere::Param] settings
34
+ # @param [String] root
35
+ def self.boot settings, root
36
+ end
37
+
9
38
  end
10
39
  end
11
40
 
12
- require 'wonderdog/configuration'
13
41
  require 'wonderdog/hadoop_invocation_override'
14
42
  require 'wonderdog/timestamp'
@@ -99,8 +99,8 @@ module Wukong
99
99
  # @return [Array<String>]
100
100
  def hadoop_jobconf_options
101
101
  if reads_from_elasticsearch? || writes_to_elasticsearch?
102
- settings[:map_speculative] = false if settings[:map_speculative].nil?
103
- settings[:reduce_speculative] = false if settings[:reduce_speculative].nil?
102
+ settings[:map_speculative] = 'false' if settings[:map_speculative].nil?
103
+ settings[:reduce_speculative] = 'false' if settings[:reduce_speculative].nil?
104
104
  end
105
105
 
106
106
  super() + [].tap do |o|
@@ -164,5 +164,5 @@ module Wukong
164
164
  end
165
165
  end
166
166
 
167
- Hadoop::Driver.class_eval { include Elasticsearch::HadoopInvocationOverride }
167
+ Hadoop::HadoopRunner.class_eval { include Elasticsearch::HadoopInvocationOverride }
168
168
  end
@@ -26,7 +26,7 @@ module Wukong
26
26
  # @param [String]
27
27
  attr_reader :mapping
28
28
 
29
- # Does the given +string+ look like a possible Elasticsearch
29
+ # Does the given `string` look like a possible Elasticsearch
30
30
  # /index/mapping specification?
31
31
  #
32
32
  # @param [String] string
@@ -1,4 +1,4 @@
1
1
  module Wonderdog
2
2
  # The currently running Wonderdog version
3
- VERSION = '0.0.2'
3
+ VERSION = '0.1.0'
4
4
  end
@@ -1,12 +1,10 @@
1
1
  require 'wonderdog'
2
2
  require 'wukong/spec_helpers'
3
- require_relative('support/integration_helper')
4
- require_relative('support/driver_helper')
5
-
6
3
 
7
4
  RSpec.configure do |config|
8
5
 
9
6
  config.before(:each) do
7
+ Wukong::Log.level = Log4r::OFF
10
8
  @orig_reg = Wukong.registry.show
11
9
  end
12
10
 
@@ -14,9 +12,18 @@ RSpec.configure do |config|
14
12
  Wukong.registry.clear!
15
13
  Wukong.registry.merge!(@orig_reg)
16
14
  end
17
-
15
+
18
16
  include Wukong::SpecHelpers
19
- include Wukong::Elasticsearch::IntegrationHelper
20
- include Wukong::Elasticsearch::DriverHelper
21
- end
17
+
18
+ def root
19
+ @root ||= Pathname.new(File.expand_path('../..', __FILE__))
20
+ end
22
21
 
22
+ def hadoop_runner *args, &block
23
+ runner(Wukong::Hadoop::HadoopRunner, 'wu-hadoop', *args) do
24
+ stub!(:execute_command!)
25
+ instance_eval(&block) if block_given?
26
+ end
27
+ end
28
+
29
+ end
File without changes
@@ -2,10 +2,10 @@ require 'spec_helper'
2
2
 
3
3
  describe Wukong::Elasticsearch::HadoopInvocationOverride do
4
4
 
5
- let(:no_es) { driver('regexp', 'count', input: '/tmp/input_file', output: '/tmp/output_file') }
6
- let(:es_reader) { driver('regexp', 'count', input: 'es://the_index/the_map', output: '/tmp/output_file') }
7
- let(:es_writer) { driver('regexp', 'count', input: '/tmp/input_file', output: 'es:///the_index/the_map') }
8
- let(:es_complex) { driver('regexp', 'count', input: 'es://the_index/the_map', output: 'es:///the_index/the_map', es_query: '{"hi": "there"}', es_request_size: 1000, es_index_field: 'ID') }
5
+ let(:no_es) { hadoop_runner('regexp', 'count', input: '/tmp/input_file', output: '/tmp/output_file') }
6
+ let(:es_reader) { hadoop_runner('regexp', 'count', input: 'es://the_index/the_map', output: '/tmp/output_file') }
7
+ let(:es_writer) { hadoop_runner('regexp', 'count', input: '/tmp/input_file', output: 'es:///the_index/the_map') }
8
+ let(:es_complex) { hadoop_runner('regexp', 'count', input: 'es://the_index/the_map', output: 'es:///the_index/the_map', es_query: '{"hi": "there"}', es_request_size: 1000, es_index_field: 'ID', map_speculative: true, reduce_speculative: true) }
9
9
 
10
10
  context "passing necessary jars to Hadoop streaming" do
11
11
  before { Dir.stub!(:[]).and_return(["/lib/dir/elasticsearch.jar"], ["/lib/dir/wonderdog.jar"]) }
@@ -36,16 +36,32 @@ describe Wukong::Elasticsearch::HadoopInvocationOverride do
36
36
  context "setting speculative execution" do
37
37
  context "when not given speculative options" do
38
38
  context "and not interacting with Elasticsearch" do
39
- it "doesn't add jars" do
39
+ it "doesn't add any speculative options" do
40
40
  no_es.hadoop_commandline.should_not match('speculative')
41
41
  end
42
42
  end
43
43
  context "and reading from Elasticsearch" do
44
- it "adds default jars it finds on the local filesystem" do
45
- es_reader.hadoop_commandline.should match('-mapred.map.tasks.speculative.execution.*false')
46
- es_reader.hadoop_commandline.should match('-mapred.reduce.tasks.speculative.execution.*false')
44
+ it "disables speculative execution in the mapper" do
45
+ es_reader.hadoop_commandline.should match(/-D mapred.map.tasks.speculative.execution.*false/)
46
+ end
47
+ it "disables speculative execution in the reducer" do
48
+ es_reader.hadoop_commandline.should match(/-D mapred.reduce.tasks.speculative.execution.*false/)
47
49
  end
48
50
  end
51
+ context "and writing to Elasticsearch" do
52
+ it "disables speculative execution in the mapper" do
53
+ es_writer.hadoop_commandline.should match(/-D mapred.map.tasks.speculative.execution.*false/)
54
+ end
55
+ it "disables speculative execution in the reducer" do
56
+ es_writer.hadoop_commandline.should match(/-D mapred.reduce.tasks.speculative.execution.*false/)
57
+ end
58
+ end
59
+ end
60
+ context "when given speculative options" do
61
+ it "does not change them" do
62
+ es_complex.hadoop_commandline.should match(/-D mapred.map.tasks.speculative.execution.*true/)
63
+ es_complex.hadoop_commandline.should match(/-D mapred.reduce.tasks.speculative.execution.*true/)
64
+ end
49
65
  end
50
66
  end
51
67
 
@@ -0,0 +1,18 @@
1
+ require 'spec_helper'
2
+
3
+ describe 'wu-hadoop' do
4
+
5
+ context "when wonderdog hasn't been required" do
6
+ let(:script) { examples_dir('no_wonderdog.rb') }
7
+ it "doesn't recognize Elasticsearch URIs" do
8
+ command('wu-hadoop', script, '--input=es://foo/bar', '--output=/some/path', '--dry_run').should_not have_stdout('elasticsearch')
9
+ end
10
+ end
11
+
12
+ context "when wonderdog has been required" do
13
+ let(:script) { examples_dir('wonderdog.rb') }
14
+ it "recognizes Elasticsearch URIs" do
15
+ command('wu-hadoop', script, '--input=es://foo/bar', '--output=/some/path', '--dry_run').should have_stdout('elasticsearch')
16
+ end
17
+ end
18
+ end
@@ -0,0 +1,5 @@
1
+ require 'spec_helper'
2
+
3
+ describe Wukong::Elasticsearch do
4
+ it_behaves_like 'a plugin'
5
+ end
@@ -28,6 +28,5 @@ EOF
28
28
  gem.test_files = gem.files.grep(/^spec/)
29
29
  gem.require_paths = ['lib']
30
30
 
31
- gem.add_dependency('wukong', '3.0.0.pre3')
32
- gem.add_dependency('wukong-hadoop', '>= 0.0.2')
31
+ gem.add_dependency('wukong-hadoop', '0.1.0')
33
32
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wonderdog
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.2
4
+ version: 0.1.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,16 +13,16 @@ authors:
13
13
  autorequire:
14
14
  bindir: bin
15
15
  cert_chain: []
16
- date: 2012-12-17 00:00:00.000000000 Z
16
+ date: 2013-02-20 00:00:00.000000000 Z
17
17
  dependencies:
18
18
  - !ruby/object:Gem::Dependency
19
- name: wukong
19
+ name: wukong-hadoop
20
20
  requirement: !ruby/object:Gem::Requirement
21
21
  none: false
22
22
  requirements:
23
23
  - - '='
24
24
  - !ruby/object:Gem::Version
25
- version: 3.0.0.pre3
25
+ version: 0.1.0
26
26
  type: :runtime
27
27
  prerelease: false
28
28
  version_requirements: !ruby/object:Gem::Requirement
@@ -30,23 +30,7 @@ dependencies:
30
30
  requirements:
31
31
  - - '='
32
32
  - !ruby/object:Gem::Version
33
- version: 3.0.0.pre3
34
- - !ruby/object:Gem::Dependency
35
- name: wukong-hadoop
36
- requirement: !ruby/object:Gem::Requirement
37
- none: false
38
- requirements:
39
- - - ! '>='
40
- - !ruby/object:Gem::Version
41
- version: 0.0.2
42
- type: :runtime
43
- prerelease: false
44
- version_requirements: !ruby/object:Gem::Requirement
45
- none: false
46
- requirements:
47
- - - ! '>='
48
- - !ruby/object:Gem::Version
49
- version: 0.0.2
33
+ version: 0.1.0
50
34
  description: ! " Wonderdog provides code in both Ruby and Java to make Elasticsearch\n
51
35
  \ a more fully-fledged member of both the Hadoop and Wukong\n ecosystems.\n\n For
52
36
  the Java side, Wonderdog provides InputFormat and OutputFormat\n classes for use
@@ -60,9 +44,9 @@ extra_rdoc_files: []
60
44
  files:
61
45
  - .gitignore
62
46
  - .rspec
47
+ - .yardopts
63
48
  - CHANGELOG.md
64
49
  - Gemfile
65
- - Gemfile.lock
66
50
  - LICENSE.md
67
51
  - README.md
68
52
  - Rakefile
@@ -75,8 +59,9 @@ files:
75
59
  - config/more_settings.yml
76
60
  - config/run_elasticsearch-2.sh
77
61
  - config/ufo_config.json
62
+ - examples/no_wonderdog.rb
63
+ - examples/wonderdog.rb
78
64
  - lib/wonderdog.rb
79
- - lib/wonderdog/configuration.rb
80
65
  - lib/wonderdog/hadoop_invocation_override.rb
81
66
  - lib/wonderdog/index_and_mapping.rb
82
67
  - lib/wonderdog/timestamp.rb
@@ -89,10 +74,11 @@ files:
89
74
  - notes/pigstorefunc.pig
90
75
  - pom.xml
91
76
  - spec/spec_helper.rb
92
- - spec/support/driver_helper.rb
93
- - spec/support/integration_helper.rb
77
+ - spec/support/.gitkeep
94
78
  - spec/wonderdog/hadoop_invocation_override_spec.rb
95
79
  - spec/wonderdog/index_and_type_spec.rb
80
+ - spec/wonderdog/wu-hadoop_spec.rb
81
+ - spec/wonderdog_spec.rb
96
82
  - src/main/java/com/infochimps/elasticsearch/ElasticSearchInputFormat.java
97
83
  - src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputCommitter.java
98
84
  - src/main/java/com/infochimps/elasticsearch/ElasticSearchOutputFormat.java
@@ -141,8 +127,9 @@ specification_version: 3
141
127
  summary: Make Hadoop and ElasticSearch play together nicely.
142
128
  test_files:
143
129
  - spec/spec_helper.rb
144
- - spec/support/driver_helper.rb
145
- - spec/support/integration_helper.rb
130
+ - spec/support/.gitkeep
146
131
  - spec/wonderdog/hadoop_invocation_override_spec.rb
147
132
  - spec/wonderdog/index_and_type_spec.rb
133
+ - spec/wonderdog/wu-hadoop_spec.rb
134
+ - spec/wonderdog_spec.rb
148
135
  has_rdoc:
@@ -1,57 +0,0 @@
1
- PATH
2
- remote: .
3
- specs:
4
- wonderdog (0.0.1)
5
- wukong (= 3.0.0.pre3)
6
- wukong-hadoop (>= 0.0.2)
7
-
8
- GEM
9
- remote: http://rubygems.org/
10
- specs:
11
- configliere (0.4.18)
12
- highline (>= 1.5.2)
13
- multi_json (>= 1.1)
14
- diff-lcs (1.1.3)
15
- eventmachine (1.0.0)
16
- forgery (0.5.0)
17
- gorillib (0.4.2)
18
- configliere (>= 0.4.13)
19
- json
20
- multi_json (>= 1.1)
21
- highline (1.6.15)
22
- json (1.7.5)
23
- log4r (1.1.10)
24
- multi_json (1.5.0)
25
- rake (0.9.6)
26
- rspec (2.12.0)
27
- rspec-core (~> 2.12.0)
28
- rspec-expectations (~> 2.12.0)
29
- rspec-mocks (~> 2.12.0)
30
- rspec-core (2.12.2)
31
- rspec-expectations (2.12.1)
32
- diff-lcs (~> 1.1.3)
33
- rspec-mocks (2.12.0)
34
- uuidtools (2.1.3)
35
- vayacondios-client (0.1.2)
36
- configliere (>= 0.4.16)
37
- gorillib (~> 0.4.2)
38
- multi_json (~> 1.1)
39
- wukong (3.0.0.pre3)
40
- configliere (>= 0.4.18)
41
- eventmachine
42
- forgery
43
- gorillib (>= 0.4.2)
44
- log4r
45
- multi_json (>= 1.3.6)
46
- uuidtools
47
- vayacondios-client (>= 0.1.2)
48
- wukong-hadoop (0.0.2)
49
- wukong (= 3.0.0.pre3)
50
-
51
- PLATFORMS
52
- ruby
53
-
54
- DEPENDENCIES
55
- rake (~> 0.9)
56
- rspec (~> 2)
57
- wonderdog!
@@ -1,26 +0,0 @@
1
- module Wukong
2
- module Elasticsearch
3
-
4
- # Configure the given +settings+ to be able to work with
5
- # Elasticsearch.
6
- #
7
- # @param [Configliere::Param] settings
8
- # @return [Configliere::Param] the newly configured settings
9
- def self.configure settings
10
- settings.define(:es_tmp_dir, :description => "Temporary directory on the HDFS to store job files while reading/writing to ElasticSearch", :default => "/user/#{ENV['USER']}/wukong", :wukong_hadoop => true)
11
- settings.define(:es_lib_dir, :description => "Directory containing Elasticsearch, Wonderdog, and other support jars", :default => "/usr/lib/hadoop/lib", :wukong_hadoop => true)
12
- settings.define(:es_config, :description => "Where to find configuration files detailing how to join an ElasticSearch cluster", :wukong_hadoop => true)
13
- settings.define(:es_input_splits, :description => "Number of input splits to target when reading from ElasticSearch", :type => Integer, :wukong_hadoop => true)
14
- settings.define(:es_request_size, :description => "Number of objects requested during each batch read from ElasticSearch", :type => Integer, :wukong_hadoop => true)
15
- settings.define(:es_scroll_timeout, :description => "Amount of time to wait on a scroll", :wukong_hadoop => true)
16
- settings.define(:es_index_field, :description => "Field to use from each record to override the default index", :wukong_hadoop => true)
17
- settings.define(:es_mapping_field, :description => "Field to use from each record to override the default mapping", :wukong_hadoop => true)
18
- settings.define(:es_id_field, :description => "If this field is present in a record, make an update request, otherwise make a create request", :wukong_hadoop => true)
19
- settings.define(:es_bulk_size, :description => "Number of requests to batch locally before making a request to ElasticSearch", :type => Integer, :wukong_hadoop => true)
20
- settings.define(:es_query, :description => "Query to use when defining input splits for ElasticSearch input", :wukong_hadoop => true)
21
-
22
- settings
23
- end
24
- end
25
-
26
- end
@@ -1,15 +0,0 @@
1
- module Wukong
2
- module Elasticsearch
3
- module DriverHelper
4
-
5
- def driver *args
6
- params = Elasticsearch.configure(Hadoop.configure(Configliere::Param.new))
7
- params.resolve!
8
- params.merge!(args.pop) if args.last.is_a?(Hash)
9
- Hadoop::Driver.new(params, *args)
10
- end
11
-
12
- end
13
- end
14
- end
15
-
@@ -1,30 +0,0 @@
1
- module Wukong
2
- module Elasticsearch
3
- module IntegrationHelper
4
-
5
- def root
6
- @root ||= Pathname.new(File.expand_path('../../..', __FILE__))
7
- end
8
-
9
- def lib_dir
10
- root.join('lib')
11
- end
12
-
13
- def bin_dir
14
- root.join('bin')
15
- end
16
-
17
- def integration_env
18
- {
19
- "RUBYLIB" => [lib_dir.to_s, ENV["RUBYLIB"]].compact.join(':')
20
- }
21
- end
22
-
23
- def integration_cwd
24
- root.to_s
25
- end
26
-
27
- end
28
- end
29
- end
30
-