rbpig 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/bin/rbpig ADDED
@@ -0,0 +1,26 @@
1
#!/usr/bin/env ruby
# Command-line launcher for rbpig.
#
# Parses the cluster/metastore config options and exec()s the pig
# command line built by RBPig.executable, replacing this process.
require 'rubygems'
require 'optparse'
require 'ostruct'
require 'rbpig'

options = OpenStruct.new

OptionParser.new do |opts|
  opts.banner = "USAGE: rbpig [options]"

  opts.on("-c", "--conf HADOOP_CONF", "Use this hadoop cluster xml config file.") do |config|
    options.hadoop_config = config
  end

  # FIX: long option was misspelled "--mata-conf"; corrected to "--meta-conf".
  opts.on("-m", "--meta-conf HIVE_CONF", "Use this hive metastore xml config file.") do |config|
    options.hive_config = config
  end

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end.parse!

# Replace the current process with the pig invocation so signals and the
# exit status pass straight through to the caller.
exec(RBPig.executable(:hadoop_config => options.hadoop_config, :hive_config => options.hive_config))
Binary file
Binary file
Binary file
@@ -0,0 +1,23 @@
1
module RBPig
  # A named dataset expressed as a single Pig Latin LOAD statement.
  # Instances are created through the class-level factory methods
  # (currently only +hive+); +to_s+ yields the script fragment.
  class Dataset
    class << self
      # Default Hive metastore settings used when building the LOAD
      # statement. Frozen so callers cannot mutate the shared default.
      HIVE_CONFIG = {:database => "default", :database_root => "/user/hive/warehouse"}.freeze

      # Builds a Dataset that loads a Hive table via HiveTableLoader.
      #
      # table_name   - name of the Hive table (also used as the Pig alias).
      # storage_type - only :text_file is supported; anything else raises.
      # config       - overrides for :database, :database_root and
      #                :field_separator (default "\t", escaped for Pig).
      #
      # Raises RuntimeError for unsupported storage types.
      def hive(table_name, storage_type = :text_file, config = {})
        raise "storage type other than :text_file is not supported." unless storage_type == :text_file

        config = HIVE_CONFIG.merge(:field_separator => "\\t").merge(config)
        new("#{table_name} = LOAD '#{config[:database_root]}/#{table_name}' USING HiveTableLoader('#{config[:field_separator]}', '#{config[:database]}');")
      end
    end

    # Returns the Pig Latin LOAD statement for this dataset.
    def to_s
      @load_script
    end

    private
    # Instances are only built by the factory methods above.
    def initialize(load_script)
      @load_script = load_script
    end
  end
end
data/lib/rbpig.rb ADDED
@@ -0,0 +1,106 @@
1
+ require 'fileutils'
2
+ require 'rexml/document'
3
+ require File.join(File.dirname(__FILE__), "rbpig", "dataset")
4
+
5
module RBPig
  class << self
    # Builds the shell command line that launches pig with the jar
    # classpath and JVM options derived from the given configs hash
    # (:hadoop_config and :hive_config, both optional file paths).
    def executable(configs)
      configs = pig_configs(configs)

      pig_options = ["-Dudf.import.list=forward.pig.storage"]
      unless configs[:hadoop_config].nil?
        # Lift fs.default.name / mapred.job.tracker out of the hadoop
        # XML config so pig talks to the right cluster.
        hadoop_config = {}
        REXML::Document.new(File.new(configs[:hadoop_config])).elements.each('configuration/property') do |property|
          hadoop_config[property.elements[1].text] = property.elements[2].text
        end
        pig_options << "-Dfs.default.name=#{hadoop_config["fs.default.name"]}" if hadoop_config.has_key?("fs.default.name")
        pig_options << "-Dmapred.job.tracker=#{hadoop_config["mapred.job.tracker"]}" if hadoop_config.has_key?("mapred.job.tracker")
      end
      ["PIG_CLASSPATH='#{pig_classpath(configs)}'", "PIG_OPTS='#{pig_options.join(" ")}'", "pig", "-l /tmp"].join(" ")
    end

    # Yields a Pig session configured with the given configs hash.
    def connect(configs)
      yield Pig.new(pig_configs(configs))
    end

    private
    # Normalizes a possibly-nil configs hash to always contain both keys.
    def pig_configs(configs)
      {:hadoop_config => nil, :hive_config => nil}.merge(configs || {})
    end

    # Colon-joined classpath of the bundled jars, plus the directory of
    # the hive-site.xml metastore config when one is supplied.
    def pig_classpath(configs)
      jar_paths = [
        %w[.. java dist porkchop.jar],
        %w[.. java lib hive hive-exec-0.5.0+32.jar],
        %w[.. java lib hive hive-metastore-0.5.0+32.jar],
        %w[.. java lib hive libfb303.jar],
        %w[.. java lib hive jdo2-api-2.3-SNAPSHOT.jar],
        %w[.. java lib hive datanucleus-core-1.1.2-patched.jar],
        %w[.. java lib hive datanucleus-enhancer-1.1.2.jar],
        %w[.. java lib hive datanucleus-rdbms-1.1.2.jar],
        %w[.. java lib pig jsp-2.1-6.1.14.jar]
      ]
      classpath = jar_paths.map { |parts| File.join(File.dirname(__FILE__), parts) }
      unless configs[:hive_config].nil?
        # Hive discovers its metastore config by file name, so enforce it.
        raise "Rename '#{configs[:hive_config]}' to hive-site.xml for hive metastore configuration." unless File.basename(configs[:hive_config]) == "hive-site.xml"
        classpath << File.dirname(configs[:hive_config])
      end
      classpath.join(":").freeze
    end
  end

  # A Pig session: accumulates statements and runs them as one script.
  class Pig
    def initialize(configs)
      @configs = configs
      @oink_oink = []  # ordered Pig Latin statements for the script
    end

    # Appends the LOAD statements of the given Dataset objects.
    def datasets(*datasets)
      datasets.each {|e| @oink_oink << e.to_s}
    end

    # Appends a raw Pig Latin statement.
    def grunt(oink)
      @oink_oink << oink
    end

    # Runs the accumulated script, STOREs each alias to a temp HDFS dir,
    # then pulls the results back. Returns one array of rows (each row an
    # array of tab-split fields) per alias; with a single alias the array
    # itself is returned (via the return-splat) rather than a 1-tuple.
    # Raises with the captured pig output on failure.
    def fetch(*aliases)
      alias_dump_dir = "/tmp/pigdump/#{Process.pid}_#{Time.now.to_i}"
      aliases = aliases.map {|alias_to_fetch| "#{alias_dump_dir}/#{alias_to_fetch}"}

      pig_script_path = "/tmp/pigscript/#{Process.pid}_#{Time.now.to_i}"
      FileUtils.mkdir_p(File.dirname(pig_script_path))
      File.open(pig_script_path, "w") do |file|
        @oink_oink.each {|oink| file << "#{oink}\n"}
        aliases.each do |dump_file_path|
          file << "STORE #{File.basename(dump_file_path)} INTO '#{dump_file_path}' USING PigStorage ('\\t');\n"
        end
      end

      pig_execution = "#{RBPig.executable(@configs)} -f #{pig_script_path} 2>&1"
      pig_out = []
      IO.popen(pig_execution) do |stdout|
        puts pig_execution
        # Echo pig's output live while also capturing it for error reporting.
        until stdout.eof? do
          pig_out << stdout.gets
          puts pig_out.last
        end
      end

      if $?.success?
        return *fetch_files_in_hdfs(aliases).map {|lines| lines.map{|e| e.chomp("\n").split("\t", -1)}}
      else
        raise "#{pig_out.join("\n")}Failed executing #{pig_execution}"
      end
    end

    private
    # Copies each HDFS file to the identical local path via mandy-get,
    # removes it from HDFS, and returns the local lines per file.
    # NOTE(review): file paths are interpolated unescaped into shell
    # commands — safe only while paths stay under /tmp/pigdump; confirm.
    def fetch_files_in_hdfs(file_paths)
      mandy_config = @configs[:hadoop_config].nil? ? "" : "-c #{@configs[:hadoop_config]}"
      return file_paths.map do |file_path|
        # File.exist? — File.exists? was deprecated and removed in Ruby 3.2.
        FileUtils.remove_file(file_path, true) if File.exist?(file_path)
        `mandy-get #{mandy_config} #{file_path} #{file_path}`
        `mandy-rm #{mandy_config} #{file_path}`
        File.open(file_path) {|file| file.readlines}
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,89 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rbpig
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 1
8
+ - 0
9
+ version: 0.1.0
10
+ platform: ruby
11
+ authors:
12
+ - Jae Lee
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-12-22 00:00:00 +00:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: mandy
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
25
+ requirements:
26
+ - - ">="
27
+ - !ruby/object:Gem::Version
28
+ segments:
29
+ - 0
30
+ - 5
31
+ - 0
32
+ version: 0.5.0
33
+ type: :runtime
34
+ version_requirements: *id001
35
+ description: Simple lib for executing Pig queries, supports textfile based hive table loader with automatic schema discovery
36
+ email: jlee@yetitrails.com
37
+ executables:
38
+ - rbpig
39
+ extensions: []
40
+
41
+ extra_rdoc_files: []
42
+
43
+ files:
44
+ - bin/rbpig
45
+ - lib/rbpig.rb
46
+ - lib/rbpig/dataset.rb
47
+ - java/dist/porkchop.jar
48
+ - java/lib/hive/hive-exec-0.5.0+32.jar
49
+ - java/lib/hive/hive-metastore-0.5.0+32.jar
50
+ - java/lib/hive/libfb303.jar
51
+ - java/lib/hive/jdo2-api-2.3-SNAPSHOT.jar
52
+ - java/lib/hive/datanucleus-core-1.1.2-patched.jar
53
+ - java/lib/hive/datanucleus-enhancer-1.1.2.jar
54
+ - java/lib/hive/datanucleus-rdbms-1.1.2.jar
55
+ - java/lib/pig/jsp-2.1-6.1.14.jar
56
+ has_rdoc: true
57
+ homepage: https://github.com/forward/rbpig
58
+ licenses: []
59
+
60
+ post_install_message:
61
+ rdoc_options: []
62
+
63
+ require_paths:
64
+ - lib
65
+ required_ruby_version: !ruby/object:Gem::Requirement
66
+ none: false
67
+ requirements:
68
+ - - ">="
69
+ - !ruby/object:Gem::Version
70
+ segments:
71
+ - 0
72
+ version: "0"
73
+ required_rubygems_version: !ruby/object:Gem::Requirement
74
+ none: false
75
+ requirements:
76
+ - - ">="
77
+ - !ruby/object:Gem::Version
78
+ segments:
79
+ - 0
80
+ version: "0"
81
+ requirements: []
82
+
83
+ rubyforge_project:
84
+ rubygems_version: 1.3.7
85
+ signing_key:
86
+ specification_version: 3
87
+ summary: Pig queries execution ruby binding
88
+ test_files: []
89
+