sandofsky-ralph 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,23 @@
1
+ # -*- ruby -*-
2
+
3
+ require 'autotest/restart'
4
+
5
+ # Autotest.add_hook :initialize do |at|
6
+ # at.extra_files << "../some/external/dependency.rb"
7
+ #
8
+ # at.libs << ":../some/external"
9
+ #
10
+ # at.add_exception 'vendor'
11
+ #
12
+ # at.add_mapping(/dependency.rb/) do |f, _|
13
+ # at.files_matching(/test_.*rb$/)
14
+ # end
15
+ #
16
+ # %w(TestA TestB).each do |klass|
17
+ # at.extra_class_map[klass] = "test/test_misc.rb"
18
+ # end
19
+ # end
20
+
21
+ # Autotest.add_hook :run_command do |at|
22
+ # system "rake build"
23
+ # end
@@ -0,0 +1,6 @@
1
+ === 0.1.0 / 2009-08-12
2
+
3
+ * 1 major enhancement
4
+
5
+ * Birthday!
6
+
@@ -0,0 +1,9 @@
1
+ .autotest
2
+ History.txt
3
+ Manifest.txt
4
+ README.txt
5
+ Rakefile
6
+ bin/ralph
7
+ lib/iterator.rb
8
+ lib/ralph.rb
9
+ test/test_ralph.rb
@@ -0,0 +1,76 @@
1
+ = ralph
2
+
3
+ * http://github.com/sandofsky/ralph
4
+
5
+ == DESCRIPTION:
6
+
7
+ A pretty, simple wrapper for writing Hadoop Streaming jobs in Ruby.
8
+
9
+ == FEATURES/PROBLEMS:
10
+
11
+ * First version.
12
+
13
+ == SYNOPSIS:
14
+
15
+ Define your mappers and reducers in Ruby with the help of a domain specific language.
16
+
17
+ Ralph.job do
18
+
19
+ mapper do |row|
20
+ row.split(/\s/).each do |word|
21
+ emit word, 1
22
+ end
23
+ end
24
+
25
+ reducer do |key, values|
26
+ total = values.inject(0) {|t, i| t + i.to_i }
27
+ emit key, total
28
+ end
29
+
30
+ end
31
+
32
+ Then run your task with:
33
+
34
+ ralph fire word_count.rb input.csv output
35
+
36
+ Which would translate into:
37
+
38
+ hadoop jar $HADOOP_HOME/contrib/streaming/hadoop-*-streaming.jar \
39
+ -input input.csv \
40
+ -output output \
41
+ -mapper "ralph map word_count.rb" \
42
+ -reducer "ralph reduce word_count.rb" \
43
+ -file word_count.rb
44
+
45
+ == REQUIREMENTS:
46
+
47
+ * Make sure to export your $HADOOP_HOME as the path to hadoop.
48
+
49
+ == INSTALL:
50
+
51
+ * sudo gem install sandofsky-ralph --source http://gems.github.com
52
+
53
+ == LICENSE:
54
+
55
+ (The MIT License)
56
+
57
+ Copyright (c) 2009 Ben Sandofsky
58
+
59
+ Permission is hereby granted, free of charge, to any person obtaining
60
+ a copy of this software and associated documentation files (the
61
+ 'Software'), to deal in the Software without restriction, including
62
+ without limitation the rights to use, copy, modify, merge, publish,
63
+ distribute, sublicense, and/or sell copies of the Software, and to
64
+ permit persons to whom the Software is furnished to do so, subject to
65
+ the following conditions:
66
+
67
+ The above copyright notice and this permission notice shall be
68
+ included in all copies or substantial portions of the Software.
69
+
70
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
71
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
72
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
73
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
74
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
75
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
76
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,13 @@
1
+ # -*- ruby -*-
2
+
3
+ require 'rubygems'
4
+ require 'hoe'
5
+
6
# Gem specification for ralph, declared through Hoe's spec DSL.
Hoe.spec 'ralph' do
  developer('Ben Sandofsky', 'sandofsky@gmail.com')

  # Runtime dependencies, appended to extra_deps in their original order.
  [
    ['fastercsv', '~> 1.4.0'],
    ['sandofsky-csvscan', '>= 0.1.0'],
    ['wycats-thor', '>= 0.11.5']
  ].each { |dependency| extra_deps << dependency }
end
12
+
13
+ # vim: syntax=ruby
@@ -0,0 +1,60 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'thor'
4
+ require File.expand_path(File.dirname(__FILE__) + "/../lib/ralph")
5
# Thor-based command line front end for Ralph jobs.
#
# Tasks:
#   fire     - submit a job script to Hadoop Streaming
#   simulate - run mapper | sort | reducer locally, reading STDIN
#   map      - run only the mapper of a job script (STDIN -> STDOUT)
#   reduce   - run only the reducer of a job script (STDIN -> STDOUT)
class RalphCLI < Thor
  # Resolves a job script argument to something `require` can load.
  #
  # A script that exists relative to the current directory is expanded to an
  # absolute path, because Ruby >= 1.9.2 no longer has "." on the load path.
  # Anything else is returned untouched so load-path lookups keep working.
  def self.script_path(script)
    File.exist?(script) ? File.expand_path(script) : script
  end

  desc "fire <script.rb> <input> [output]", "run via hadoop streaming"
  # Builds the Hadoop Streaming command line for +script+ and echoes whatever
  # the hadoop process prints.
  #
  # script - path of the Ralph job file shipped with -file
  # input  - value passed to -input
  # output - value passed to -output; defaults to "<input>_output"
  def fire(script, input, output = nil)
    require 'shellwords'

    hadoop_home = ENV["HADOOP_HOME"]
    if hadoop_home.nil? || hadoop_home.empty?
      abort "HADOOP_HOME is not set; export it as the path to hadoop"
    end

    output ||= "#{input}_output"
    home = Shellwords.escape(hadoop_home)
    hadoop_command = [
      "#{home}/bin/hadoop", "jar",
      # The jar name is left unescaped on purpose: the shell must expand the
      # version glob.
      "#{home}/contrib/streaming/hadoop-*-streaming.jar",
      "-input", Shellwords.escape(input),
      "-output", Shellwords.escape(output),
      "-mapper", Shellwords.escape("ralph map #{script}"),
      "-reducer", Shellwords.escape("ralph reduce #{script}"),
      "-file", Shellwords.escape(script)
    ].join(" ")

    IO.popen(hadoop_command, 'r+') do |io|
      io.each_line { |line| puts line }
    end
  end

  desc "simulate <script.rb>", "simulates a mapreduce job through unix pipes and 'sort'"
  # Pipes STDIN through the job's mapper into a `sort` subprocess, then feeds
  # the sorted lines to the reducer (or prints them as-is when the job defines
  # no reducer).
  def simulate(script)
    require self.class.script_path(script)
    sorter = IO.popen("sort", 'w+')
    threads = []

    threads << Thread.new(sorter) do |s|
      begin
        Ralph.run_mapper!(STDIN, s)
      ensure
        # Always signal EOF to sort; otherwise the reader thread below would
        # block forever if the mapper raises.
        s.close_write
      end
    end

    threads << Thread.new(sorter) do |s|
      if Ralph::Job.reducer?
        Ralph.run_reducer!(s, STDOUT)
      else
        s.each_line { |line| puts line }
      end
    end

    threads.each { |t| t.join }
  ensure
    sorter.close if sorter && !sorter.closed?
  end

  desc "map <script.rb>", "Run mapper"
  # Loads the job script and runs its mapper with the default streams.
  def map(script)
    require self.class.script_path(script)
    Ralph.run_mapper!
  end

  desc "reduce <script.rb>", "Run reducer"
  # Loads the job script and runs its reducer with the default streams.
  def reduce(script)
    require self.class.script_path(script)
    Ralph.run_reducer!
  end

end
60
+ RalphCLI.start
@@ -0,0 +1,67 @@
1
class Ralph
  # Value iterator handed to a reducer block.
  #
  # Reads tab-separated "key<TAB>value" lines from +input+ (parsed with
  # FasterCSV) one record ahead, and yields the values that share the current
  # key. State lives on the module itself, so only one reduce stream can be
  # processed at a time.
  #
  # NOTE(review): whether a group is reduced at all is decided by the caller's
  # loop, not here. A caller that stops as soon as +input+ reports EOF will
  # never ask for a final group whose first record is the last line of input.
  module Iterator
    @input = STDIN
    @last_key = nil        # key of the record read before the current one
    @current_key = nil     # key of the record most recently read
    @current_value = nil   # value of the record most recently read
    @has_iterated = nil    # true once each/inject ran since the last ready!
    @pending = false       # true while the current record has not been yielded

    class << self
      # The IO-like object records are read from (needs #readline and #eof?).
      attr_accessor :input

      # Key of the record most recently read by grab!.
      attr_reader :current_key

      # Re-arms the iterator so each/inject may be called once more.
      def ready!
        @has_iterated = false
      end

      # True when the most recently read record has a different key than the
      # record read before it.
      def start_of_group?
        @last_key != @current_key
      end

      # Reads records until the key changes or the input is exhausted.
      def next_group!
        grab! until start_of_group? || input.eof?
      end

      # Yields every value of the current group, reading ahead one record at a
      # time. The record already in hand is yielded even when the input has
      # reached EOF, so the last value of a trailing group is not lost.
      #
      # Raises RuntimeError when called twice without an intervening ready!.
      def each
        repeat_iteration_check
        group_key = @current_key
        while @pending && @current_key == group_key
          yield @current_value
          advance!
        end
        @has_iterated = true
      end

      # Folds the values of the current group into +s+.
      #
      # Unlike Enumerable#inject, the accumulator starts at +s+ (nil by
      # default); the first value is never used as the seed.
      #
      # Returns the final accumulator.
      # Raises RuntimeError when called twice without an intervening ready!.
      def inject(s = nil)
        each { |value| s = yield(s, value) }
        s
      end

      # Guards against consuming the same group twice.
      def repeat_iteration_check
        raise "You cannot call the value iterator more than once in a reduce" if @has_iterated
      end

      # Reads the next line from input and makes it the current record,
      # remembering the previous key. Propagates whatever input.readline
      # raises when no line is left.
      def grab!
        @last_key = @current_key unless @current_key.nil?
        @current_key, @current_value = FasterCSV.parse_line(input.readline, :col_sep => "\t")
        @pending = true
      end

      private

      # Marks the current record as consumed and reads the next one if any.
      def advance!
        @pending = false
        grab! unless input.eof?
      end
    end

  end
end
@@ -0,0 +1,59 @@
1
+ require 'csvscan'
2
+ require 'fastercsv'
3
+ require File.dirname(__FILE__) + '/iterator'
4
# DSL entry point for defining and running Hadoop Streaming jobs in Ruby.
#
#   Ralph.job do
#     mapper  { |row| ... emit key, value ... }
#     reducer { |key, values| ... emit key, total ... }
#   end
class Ralph
  VERSION = '0.1.0'

  # Input format requested by the mapper definition (:csv or nil).
  @@map_input = nil

  # Holds the user-defined map/reduce methods and the output stream they
  # emit to.
  class Job
    # Object that receives emitted rows via <<.
    attr_accessor :output

    # Writes one tab-separated row to output.
    #
    # key   - first column
    # value - second column; when nil or false only the key is written
    def emit(key, value = nil)
      output << FasterCSV.generate_line((value ? [key, value] : [key]), :col_sep => "\t")
    end

    # True once a reducer has been defined on Job.
    #
    # Uses method_defined? because instance_methods returns Symbols on
    # Ruby >= 1.9, so comparing against the String 'reduce' never matches.
    def self.reducer?
      method_defined?(:reduce)
    end

  end

  # Defines the job's map step.
  #
  # format - :csv to have rows parsed by CSVScan, anything else for raw lines
  # block  - receives one row/line; runs as an instance method of Job
  def self.mapper(format = nil, &block)
    Job.send(:define_method, :map, &block)
    @@map_input = format
  end

  # Defines the job's reduce step; the block runs as an instance method of Job.
  def self.reducer(&block)
    Job.send(:define_method, :reduce, &block)
  end

  # Feeds every row (CSV mode) or line (default) of input to the mapper.
  def self.run_mapper!(input = STDIN, output = STDOUT)
    job = Job.new
    job.output = output
    if @@map_input == :csv
      CSVScan.scan(input) { |row| job.map(row) }
    else
      input.each_line { |line| job.map(line) }
    end
  end

  # Calls the reducer once per iteration of the loop below, passing the
  # current key and the Iterator module as the values object.
  #
  # NOTE(review): the loop condition is input.eof?, and Iterator reads one
  # record ahead. Once the last line has been read no further reduce call is
  # made, so a trailing record that starts a new group (including a
  # single-line input) is never reduced. Confirm and fix together with
  # Iterator.
  def self.run_reducer!(input = STDIN, output = STDOUT)
    job = Job.new
    job.output = output
    Iterator.input = input
    Iterator.grab! unless input.eof?
    until input.eof?
      Iterator.ready!
      job.reduce(Iterator.current_key, Iterator)
      # Skip whatever the reducer left unread in the current group.
      Iterator.next_group! unless Iterator.start_of_group?
    end
  end

  # Evaluates the block in the context of Ralph so that mapper/reducer can be
  # called without a receiver. Does nothing when no block is given.
  def self.job(&block)
    instance_eval(&block) if block_given?
  end

end
@@ -0,0 +1,8 @@
1
+ require "test/unit"
2
+ require "ralph"
3
+
4
# Placeholder suite for Ralph; it carries no real coverage yet.
class TestRalph < Test::Unit::TestCase
  # Fails unconditionally as a reminder that real tests still need writing.
  def test_sanity
    flunk("write tests or I will kneecap you")
  end
end
metadata ADDED
@@ -0,0 +1,105 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: sandofsky-ralph
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Ben Sandofsky
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-08-14 00:00:00 -07:00
13
+ default_executable: ralph
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: fastercsv
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ~>
22
+ - !ruby/object:Gem::Version
23
+ version: 1.4.0
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: sandofsky-csvscan
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 0.1.0
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: wycats-thor
37
+ type: :runtime
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: 0.11.5
44
+ version:
45
+ - !ruby/object:Gem::Dependency
46
+ name: hoe
47
+ type: :development
48
+ version_requirement:
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: 2.3.3
54
+ version:
55
+ description: A pretty, simple wrapper for writing Hadoop Streaming jobs in Ruby.
56
+ email:
57
+ - sandofsky@gmail.com
58
+ executables:
59
+ - ralph
60
+ extensions: []
61
+
62
+ extra_rdoc_files:
63
+ - History.txt
64
+ - Manifest.txt
65
+ - README.txt
66
+ files:
67
+ - .autotest
68
+ - History.txt
69
+ - Manifest.txt
70
+ - README.txt
71
+ - Rakefile
72
+ - bin/ralph
73
+ - lib/iterator.rb
74
+ - lib/ralph.rb
75
+ - test/test_ralph.rb
76
+ has_rdoc: false
77
+ homepage: http://github.com/sandofsky/ralph
78
+ licenses:
79
+ post_install_message:
80
+ rdoc_options:
81
+ - --main
82
+ - README.txt
83
+ require_paths:
84
+ - lib
85
+ required_ruby_version: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: "0"
90
+ version:
91
+ required_rubygems_version: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - ">="
94
+ - !ruby/object:Gem::Version
95
+ version: "0"
96
+ version:
97
+ requirements: []
98
+
99
+ rubyforge_project: ralph
100
+ rubygems_version: 1.3.5
101
+ signing_key:
102
+ specification_version: 3
103
+ summary: A pretty, simple wrapper for writing Hadoop Streaming jobs in Ruby.
104
+ test_files:
105
+ - test/test_ralph.rb