sandofsky-ralph 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,23 @@
1
+ # -*- ruby -*-
2
+
3
+ require 'autotest/restart'
4
+
5
+ # Autotest.add_hook :initialize do |at|
6
+ # at.extra_files << "../some/external/dependency.rb"
7
+ #
8
+ # at.libs << ":../some/external"
9
+ #
10
+ # at.add_exception 'vendor'
11
+ #
12
+ # at.add_mapping(/dependency.rb/) do |f, _|
13
+ # at.files_matching(/test_.*rb$/)
14
+ # end
15
+ #
16
+ # %w(TestA TestB).each do |klass|
17
+ # at.extra_class_map[klass] = "test/test_misc.rb"
18
+ # end
19
+ # end
20
+
21
+ # Autotest.add_hook :run_command do |at|
22
+ # system "rake build"
23
+ # end
@@ -0,0 +1,6 @@
1
+ === 0.1.0 / 2009-08-12
2
+
3
+ * 1 major enhancement
4
+
5
+ * Birthday!
6
+
@@ -0,0 +1,9 @@
1
+ .autotest
2
+ History.txt
3
+ Manifest.txt
4
+ README.txt
5
+ Rakefile
6
+ bin/ralph
7
+ lib/iterator.rb
8
+ lib/ralph.rb
9
+ test/test_ralph.rb
@@ -0,0 +1,76 @@
1
+ = ralph
2
+
3
+ * http://github.com/sandofsky/ralph
4
+
5
+ == DESCRIPTION:
6
+
7
+ A pretty, simple wrapper for writing Hadoop Streaming jobs in Ruby.
8
+
9
+ == FEATURES/PROBLEMS:
10
+
11
+ * First version.
12
+
13
+ == SYNOPSIS:
14
+
15
+ Define your mappers and reducers in Ruby with the help of a domain specific language.
16
+
17
+ Ralph.job do
18
+
19
+ mapper do |row|
20
+ row.split(/\s/).each do |word|
21
+ emit word, 1
22
+ end
23
+ end
24
+
25
+ reducer do |key, values|
26
+ total = values.inject(0) {|t, i| t + i.to_i }
27
+ emit key, total
28
+ end
29
+
30
+ end
31
+
32
+ Then run your task with:
33
+
34
+ ralph fire word_count.rb input.csv output
35
+
36
+ Which would translate into:
37
+
38
+ $HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/contrib/streaming/hadoop-*-streaming.jar \
39
+ -input input.csv \
40
+ -output output \
41
+ -mapper "ralph map word_count.rb" \
42
+ -reducer "ralph reduce word_count.rb" \
43
+ -file word_count.rb
44
+
45
+ == REQUIREMENTS:
46
+
47
+ * Make sure to export your $HADOOP_HOME as the path to hadoop.
48
+
49
+ == INSTALL:
50
+
51
+ * sudo gem install sandofsky-ralph --source http://gems.github.com
52
+
53
+ == LICENSE:
54
+
55
+ (The MIT License)
56
+
57
+ Copyright (c) 2009 Ben Sandofsky
58
+
59
+ Permission is hereby granted, free of charge, to any person obtaining
60
+ a copy of this software and associated documentation files (the
61
+ 'Software'), to deal in the Software without restriction, including
62
+ without limitation the rights to use, copy, modify, merge, publish,
63
+ distribute, sublicense, and/or sell copies of the Software, and to
64
+ permit persons to whom the Software is furnished to do so, subject to
65
+ the following conditions:
66
+
67
+ The above copyright notice and this permission notice shall be
68
+ included in all copies or substantial portions of the Software.
69
+
70
+ THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
71
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
72
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
73
+ IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
74
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
75
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
76
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,13 @@
1
+ # -*- ruby -*-
2
+
3
+ require 'rubygems'
4
+ require 'hoe'
5
+
6
# Gem specification for ralph, declared through Hoe.
Hoe.spec 'ralph' do
  developer 'Ben Sandofsky', 'sandofsky@gmail.com'

  # Runtime dependencies, appended to extra_deps in this order.
  [
    ['fastercsv',         '~> 1.4.0'],
    ['sandofsky-csvscan', '>= 0.1.0'],
    ['wycats-thor',       '>= 0.11.5']
  ].each { |dependency| extra_deps << dependency }
end
12
+
13
+ # vim: syntax=ruby
@@ -0,0 +1,60 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'thor'
4
+ require File.expand_path(File.dirname(__FILE__) + "/../lib/ralph")
5
require 'shellwords'

# Thor-based command line front end for Ralph jobs.
#
# Tasks:
#   fire     - submit the script as a Hadoop Streaming job
#   simulate - run the job locally: STDIN -> mapper -> `sort` -> reducer -> STDOUT
#   map      - run only the mapper over STDIN (what Hadoop invokes as -mapper)
#   reduce   - run only the reducer over STDIN (what Hadoop invokes as -reducer)
class RalphCLI < Thor
  # Loads the job script so that its Ralph.job block registers the mapper
  # and reducer.
  #
  # Ruby >= 1.9.2 no longer has "." on the load path, so a bare relative
  # name such as "word_count.rb" must be expanded to an absolute path
  # before it can be required. Names that do not exist on disk are passed
  # to require untouched, preserving load-path lookups.
  #
  # script - path (with or without the ".rb" extension) or requirable name.
  def self.load_job(script)
    on_disk = File.exist?(script) || File.exist?("#{script}.rb")
    require(on_disk ? File.expand_path(script) : script)
  end

  desc "fire <script.rb> <input> [output]", "run via hadoop streaming"
  # Builds the `hadoop jar ...streaming.jar` command line and echoes the
  # command's output line by line.
  #
  # script - job file, shipped to the cluster with -file.
  # input  - value for -input.
  # output - value for -output; defaults to "<input>_output".
  def fire(script, input, output = nil)
    hadoop_home = ENV["HADOOP_HOME"]
    if hadoop_home.nil? || hadoop_home.empty?
      abort "HADOOP_HOME is not set; export it as the path to hadoop"
    end
    output ||= "#{input}_output"

    # Every user-supplied value is shell-escaped so that paths containing
    # spaces or metacharacters survive. The jar glob is deliberately left
    # unescaped because the shell has to expand it.
    home = Shellwords.escape(hadoop_home)
    hadoop_command = [
      "#{home}/bin/hadoop", "jar",
      "#{home}/contrib/streaming/hadoop-*-streaming.jar",
      "-input",   Shellwords.escape(input),
      "-output",  Shellwords.escape(output),
      "-mapper",  Shellwords.escape("ralph map #{script}"),
      "-reducer", Shellwords.escape("ralph reduce #{script}"),
      "-file",    Shellwords.escape(script)
    ].join(" ")

    IO.popen(hadoop_command, 'r+') do |io|
      io.each_line { |line| puts line }
    end
  end

  desc "simulate <script.rb>", "simulates a mapreduce job through unix pipes and 'sort'"
  # Runs the mapper over STDIN into a `sort` subprocess, then feeds the
  # sorted lines to the reducer (or straight to STDOUT when the job
  # defines no reducer).
  def simulate(script)
    self.class.load_job(script)
    sorter = IO.popen("sort", 'w+')
    begin
      feeder = Thread.new do
        begin
          Ralph.run_mapper!(STDIN, sorter)
        ensure
          # `sort` emits nothing until its stdin is closed; close it even
          # when the mapper raises so the consumer thread cannot hang.
          sorter.close_write
        end
      end

      consumer = Thread.new do
        if Ralph::Job.reducer?
          Ralph.run_reducer!(sorter, STDOUT)
        else
          sorter.each_line { |line| puts line }
        end
      end

      [feeder, consumer].each { |thread| thread.join }
    ensure
      # Release the pipe and reap the `sort` child process.
      sorter.close unless sorter.closed?
    end
  end

  desc "map <script.rb>", "Run mapper"
  # Loads the job script and runs its mapper from STDIN to STDOUT.
  def map(script)
    self.class.load_job(script)
    Ralph.run_mapper!
  end

  desc "reduce <script.rb>", "Run reducer"
  # Loads the job script and runs its reducer from STDIN to STDOUT.
  def reduce(script)
    self.class.load_job(script)
    Ralph.run_reducer!
  end
end
60
+ RalphCLI.start
@@ -0,0 +1,67 @@
1
class Ralph
  # Module-level cursor over a stream of tab-separated "key<TAB>value"
  # lines. It is handed to a reducer as its `values` argument and exposes
  # `each` and `inject` over the values that share the current key.
  #
  # All state lives in class variables, so there is exactly one cursor
  # per process.
  module Iterator
    @@input         = STDIN
    @@last_key      = nil   # key of the record read before the current one
    @@current_key   = nil
    @@current_value = nil
    @@has_iterated  = nil   # true once each/inject ran since the last ready!

    # Sets the stream that grab! reads from.
    def self.input=(i)
      @@input = i
    end

    # Returns the stream that grab! reads from.
    def self.input
      @@input
    end

    # Re-arms the cursor so that each/inject may be called once more.
    def self.ready!
      @@has_iterated = false
    end

    # Key of the record most recently read by grab!.
    def self.current_key
      @@current_key
    end

    # True when the current record's key differs from the previous one's.
    def self.start_of_group?
      @@last_key != @@current_key
    end

    # Reads records until a key change is seen or the stream is at EOF.
    def self.next_group!
      grab! until start_of_group? || input.eof?
    end

    # Yields the current value and advances, for as long as the key stays
    # the same and the stream is not at EOF. Raises when called a second
    # time without an intervening ready!.
    def self.each
      repeat_iteration_check
      group_key = @@current_key.dup
      while group_key == @@current_key && !input.eof?
        yield @@current_value
        grab!
      end
      @@has_iterated = true
    end

    # Folds the values visited by `each` into an accumulator.
    #
    # s - initial accumulator (default nil); the block receives
    #     (accumulator, value) and returns the next accumulator.
    #
    # Returns the final accumulator.
    def self.inject(s = nil)
      each { |value| s = yield(s, value) }
      s
    end

    # Guards against walking the same group's values twice.
    def self.repeat_iteration_check
      return unless @@has_iterated
      raise "You cannot call the value iterator more than once in a reduce"
    end

    # Remembers the current key as the previous key (when there is one),
    # then reads one line and splits it on tabs into key and value.
    def self.grab!
      @@last_key = @@current_key.dup unless @@current_key.nil?
      fields = FasterCSV.parse_line(input.readline, :col_sep => "\t")
      @@current_key, @@current_value = fields
    end
  end
end
@@ -0,0 +1,59 @@
1
+ require 'csvscan'
2
+ require 'fastercsv'
3
+ require File.dirname(__FILE__) + '/iterator'
4
# Ralph is a small DSL for Hadoop Streaming jobs: a script calls
# Ralph.job with +mapper+ and +reducer+ blocks, and run_mapper! /
# run_reducer! drive those blocks over line-oriented input, writing
# tab-separated rows to the output.
class Ralph
  VERSION = '0.1.0'

  # Input format given to the most recent +mapper+ call (:csv or nil).
  @@map_input = nil

  # Receives the user's blocks as its +map+ / +reduce+ instance methods,
  # so the blocks run with a Job as self and can call +emit+.
  class Job
    # Any object responding to <<; rows are appended to it.
    attr_accessor :output

    # Writes one tab-separated row: "key<TAB>value", or just "key" when
    # the value is nil or false.
    def emit(key, value = nil)
      row = value ? [key, value] : [key]
      output << FasterCSV.generate_line(row, :col_sep => "\t")
    end

    # True once a reducer block has been registered.
    #
    # method_defined? is used because instance_methods returns Strings on
    # Ruby 1.8 but Symbols on 1.9+, which made the former
    # include?('reduce') check always false on modern Rubies.
    def self.reducer?
      method_defined?(:reduce)
    end
  end

  # Adapter placed between the raw input and Iterator while reducing.
  #
  # Iterator and run_reducer! use eof? to mean "there is no current
  # record left to process", but a raw IO reports EOF as soon as the last
  # line has been read, i.e. while that record is still waiting to be
  # reduced. This wrapper reports EOF only when the input was empty to
  # begin with, or after a read past the last line has been attempted,
  # so the final record is no longer dropped.
  class PendingRecordInput
    def initialize(io)
      @io = io
      @started = false    # at least one real line has been handed out
      @exhausted = false  # a read past the last line has happened
    end

    def eof?
      return true if @exhausted
      !@started && @io.eof?
    end

    # Returns the next line. The read past the end yields a blank line
    # instead of raising EOFError; Iterator.grab! then destructures the
    # parsed result into a nil key and nil value.
    def readline
      if @io.eof?
        @exhausted = true
        "\n"
      else
        @started = true
        @io.readline
      end
    end
  end

  # Registers the mapper block.
  #
  # format - :csv to feed the block rows from CSVScan.scan; anything else
  #          feeds it raw input lines.
  def self.mapper(format = nil, &block)
    Job.send(:define_method, :map, &block)
    @@map_input = format
  end

  # Registers the reducer block; it is called as reduce(key, values).
  def self.reducer(&block)
    Job.send(:define_method, :reduce, &block)
  end

  # Feeds every input record to the mapper.
  #
  # input  - object responding to each_line (or accepted by CSVScan.scan
  #          when the mapper was declared with :csv).
  # output - object responding to <<.
  def self.run_mapper!(input = STDIN, output = STDOUT)
    j = Job.new
    j.output = output
    if @@map_input == :csv
      CSVScan.scan(input) { |row| j.map(row) }
    else
      input.each_line { |line| j.map(line) }
    end
  end

  # Calls the reducer once per run of consecutive records sharing a key.
  #
  # input  - object responding to eof? and readline, yielding
  #          "key<TAB>value" lines grouped by key.
  # output - object responding to <<.
  def self.run_reducer!(input = STDIN, output = STDOUT)
    j = Job.new
    j.output = output
    records = PendingRecordInput.new(input)
    Iterator.input = records
    Iterator.grab! unless records.eof?
    until records.eof?
      Iterator.ready!
      key = Iterator.current_key
      j.reduce(key, Iterator)
      # Skip whatever the reducer left unread in this group. Without this
      # a reducer that ignores (or only partly consumes) its values would
      # be handed the same record again forever.
      Iterator.grab! while !records.eof? && Iterator.current_key == key
    end
  end

  # Evaluates the block with Ralph itself as self, so that +mapper+ and
  # +reducer+ can be called bare inside it.
  def self.job(&block)
    instance_eval(&block) if block_given?
  end
end
@@ -0,0 +1,8 @@
1
+ require "test/unit"
2
+ require "ralph"
3
+
4
# Sanity tests for the Ralph DSL. They replace the generated placeholder
# that called flunk unconditionally and kept the suite permanently red.
class TestRalph < Test::Unit::TestCase
  # The library exposes the version it is released as.
  def test_sanity
    assert_equal '0.1.0', Ralph::VERSION
  end

  # A mapper registered through Ralph.job is run once per input line and
  # emit writes tab-separated rows to the output.
  def test_mapper_emits_tab_separated_rows
    Ralph.job do
      mapper { |row| row.split(/\s/).each { |word| emit word, 1 } }
    end

    rows = []
    # A String responds to each_line and an Array to <<, so no IO objects
    # are needed here.
    Ralph.run_mapper!("a b\n", rows)

    assert_equal ["a\t1\n", "b\t1\n"], rows
  end
end
metadata ADDED
@@ -0,0 +1,105 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: sandofsky-ralph
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Ben Sandofsky
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-08-14 00:00:00 -07:00
13
+ default_executable: ralph
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: fastercsv
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ~>
22
+ - !ruby/object:Gem::Version
23
+ version: 1.4.0
24
+ version:
25
+ - !ruby/object:Gem::Dependency
26
+ name: sandofsky-csvscan
27
+ type: :runtime
28
+ version_requirement:
29
+ version_requirements: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: 0.1.0
34
+ version:
35
+ - !ruby/object:Gem::Dependency
36
+ name: wycats-thor
37
+ type: :runtime
38
+ version_requirement:
39
+ version_requirements: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ version: 0.11.5
44
+ version:
45
+ - !ruby/object:Gem::Dependency
46
+ name: hoe
47
+ type: :development
48
+ version_requirement:
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: 2.3.3
54
+ version:
55
+ description: A pretty, simple wrapper for writing Hadoop Streaming jobs in Ruby.
56
+ email:
57
+ - sandofsky@gmail.com
58
+ executables:
59
+ - ralph
60
+ extensions: []
61
+
62
+ extra_rdoc_files:
63
+ - History.txt
64
+ - Manifest.txt
65
+ - README.txt
66
+ files:
67
+ - .autotest
68
+ - History.txt
69
+ - Manifest.txt
70
+ - README.txt
71
+ - Rakefile
72
+ - bin/ralph
73
+ - lib/iterator.rb
74
+ - lib/ralph.rb
75
+ - test/test_ralph.rb
76
+ has_rdoc: false
77
+ homepage: http://github.com/sandofsky/ralph
78
+ licenses:
79
+ post_install_message:
80
+ rdoc_options:
81
+ - --main
82
+ - README.txt
83
+ require_paths:
84
+ - lib
85
+ required_ruby_version: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - ">="
88
+ - !ruby/object:Gem::Version
89
+ version: "0"
90
+ version:
91
+ required_rubygems_version: !ruby/object:Gem::Requirement
92
+ requirements:
93
+ - - ">="
94
+ - !ruby/object:Gem::Version
95
+ version: "0"
96
+ version:
97
+ requirements: []
98
+
99
+ rubyforge_project: ralph
100
+ rubygems_version: 1.3.5
101
+ signing_key:
102
+ specification_version: 3
103
+ summary: A pretty, simple wrapper for writing Hadoop Streaming jobs in Ruby.
104
+ test_files:
105
+ - test/test_ralph.rb