rbpig 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/bin/rbpig ADDED
@@ -0,0 +1,26 @@
1
#!/usr/bin/env ruby
# Command-line entry point for rbpig.
#
# Parses optional hadoop/hive XML config file paths and replaces the
# current process (exec) with the pig invocation built by RBPig.executable,
# so pig's exit status becomes this script's exit status.
require 'rubygems'
require 'optparse'
require 'ostruct'
require 'rbpig'

options = OpenStruct.new

OptionParser.new do |opts|
  opts.banner = "USAGE: rbpig [options]"

  opts.on("-c", "--conf HADOOP_CONF", "Use this hadoop cluster xml config file.") do |config|
    options.hadoop_config = config
  end

  # "--mata-conf" was a typo in the original release; the corrected
  # "--meta-conf" is primary and the old spelling is kept as a
  # backward-compatible alias so existing invocations keep working.
  opts.on("-m", "--meta-conf HIVE_CONF", "--mata-conf HIVE_CONF", "Use this hive metastore xml config file.") do |config|
    options.hive_config = config
  end

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end.parse!

exec(RBPig.executable(:hadoop_config => options.hadoop_config, :hive_config => options.hive_config))
Binary file
Binary file
Binary file
@@ -0,0 +1,23 @@
1
module RBPig
  # Immutable wrapper around a Pig Latin LOAD statement. Instances are
  # created through class-level factory methods (currently only +hive+)
  # and stringified by Pig#datasets into the generated pig script.
  class Dataset
    class << self
      # Defaults for hive-backed tables. Frozen so shared configuration
      # cannot be mutated by callers (merge below always returns a copy).
      HIVE_CONFIG = {:database => "default", :database_root => "/user/hive/warehouse"}.freeze

      # Builds a dataset that LOADs a hive table via HiveTableLoader.
      #
      # table_name   - hive table name; also used as the Pig relation alias.
      # storage_type - only :text_file is supported; anything else raises.
      # config       - optional overrides for :database, :database_root and
      #                :field_separator (default is the two-character string
      #                backslash-t, i.e. an escaped tab for the pig script).
      def hive(table_name, storage_type = :text_file, config = {})
        raise "storage type other than :text_file is not supported." unless storage_type == :text_file

        config = HIVE_CONFIG.merge(:field_separator => "\\t").merge(config)
        new("#{table_name} = LOAD '#{config[:database_root]}/#{table_name}' USING HiveTableLoader('#{config[:field_separator]}', '#{config[:database]}');")
      end
    end

    # Returns the Pig Latin LOAD statement for this dataset.
    def to_s
      @load_script
    end

    private
    # Only reachable through the class-level factory methods above.
    def initialize(load_script)
      @load_script = load_script
    end
  end
end
data/lib/rbpig.rb ADDED
@@ -0,0 +1,106 @@
1
+ require 'fileutils'
2
+ require 'rexml/document'
3
+ require File.join(File.dirname(__FILE__), "rbpig", "dataset")
4
+
5
module RBPig
  class << self
    # Builds the shell command line that launches pig with the bundled
    # porkchop classpath and, when a hadoop config file is given, the
    # cluster's fs.default.name / mapred.job.tracker settings lifted out
    # of the XML file.
    def executable(configs)
      configs = pig_configs(configs)

      pig_options = ["-Dudf.import.list=forward.pig.storage"]
      unless configs[:hadoop_config].nil?
        hadoop_config = {}
        REXML::Document.new(File.new(configs[:hadoop_config])).elements.each('configuration/property') do |property|
          # Each <property> element holds a <name>/<value> child pair.
          hadoop_config[property.elements[1].text] = property.elements[2].text
        end
        pig_options << "-Dfs.default.name=#{hadoop_config["fs.default.name"]}" if hadoop_config.has_key?("fs.default.name")
        pig_options << "-Dmapred.job.tracker=#{hadoop_config["mapred.job.tracker"]}" if hadoop_config.has_key?("mapred.job.tracker")
      end
      ["PIG_CLASSPATH='#{pig_classpath(configs)}'", "PIG_OPTS='#{pig_options.join(" ")}'", "pig", "-l /tmp"].join(" ")
    end

    # Yields a Pig session configured with the given hadoop/hive configs.
    def connect(configs)
      yield Pig.new(pig_configs(configs))
    end

    private
    # Normalizes the user-supplied configs hash; tolerates nil.
    def pig_configs(configs)
      {:hadoop_config => nil, :hive_config => nil}.merge(configs || {})
    end

    # Jars bundled with the gem, as path segments relative to this file.
    BUNDLED_JARS = [
      %w[.. java dist porkchop.jar],
      %w[.. java lib hive hive-exec-0.5.0+32.jar],
      %w[.. java lib hive hive-metastore-0.5.0+32.jar],
      %w[.. java lib hive libfb303.jar],
      %w[.. java lib hive jdo2-api-2.3-SNAPSHOT.jar],
      %w[.. java lib hive datanucleus-core-1.1.2-patched.jar],
      %w[.. java lib hive datanucleus-enhancer-1.1.2.jar],
      %w[.. java lib hive datanucleus-rdbms-1.1.2.jar],
      %w[.. java lib pig jsp-2.1-6.1.14.jar]
    ].freeze

    # Colon-joined classpath of the bundled jars plus, when a hive config
    # is supplied, the directory containing its hive-site.xml.
    def pig_classpath(configs)
      classpath = BUNDLED_JARS.map { |segments| File.join(File.dirname(__FILE__), segments) }
      unless configs[:hive_config].nil?
        # Hive discovers metastore settings via a file named exactly
        # hive-site.xml on the classpath, so enforce that name up front.
        raise "Rename '#{configs[:hive_config]}' to hive-site.xml for hive metastore configuration." unless File.basename(configs[:hive_config]) == "hive-site.xml"
        classpath << File.dirname(configs[:hive_config])
      end
      classpath.join(":").freeze
    end
  end

  # A scripted pig session: statements are accumulated via #datasets and
  # #grunt, then executed in one pig run by #fetch.
  class Pig
    def initialize(configs)
      @configs = configs
      @oink_oink = []  # accumulated Pig Latin statements, in insertion order
    end

    # Registers dataset LOAD statements (anything responding to #to_s).
    def datasets(*datasets)
      datasets.each {|e| @oink_oink << e.to_s}
    end

    # Appends a raw Pig Latin statement to the script.
    def grunt(oink)
      @oink_oink << oink
    end

    # Runs the accumulated script, STOREs each requested alias into a
    # per-run dump directory, and returns the fetched rows for each alias
    # (one splatted value per alias; each row is a tab-split array).
    # Raises with the captured pig output if the pig process fails.
    def fetch(*aliases)
      # One run id for both paths: the original computed the timestamp
      # twice, which could disagree across a second boundary.
      run_id = "#{Process.pid}_#{Time.now.to_i}"
      alias_dump_dir = "/tmp/pigdump/#{run_id}"
      aliases = aliases.map {|alias_to_fetch| "#{alias_dump_dir}/#{alias_to_fetch}"}

      pig_script_path = "/tmp/pigscript/#{run_id}"
      FileUtils.mkdir_p(File.dirname(pig_script_path))
      File.open(pig_script_path, "w") do |file|
        @oink_oink.each {|oink| file << "#{oink}\n"}
        aliases.each do |dump_file_path|
          file << "STORE #{File.basename(dump_file_path)} INTO '#{dump_file_path}' USING PigStorage ('\\t');\n"
        end
      end

      pig_execution = "#{RBPig.executable(@configs)} -f #{pig_script_path} 2>&1"
      pig_out = []
      IO.popen(pig_execution) do |stdout|
        puts pig_execution
        # Stream pig's output live while also capturing it for error reporting.
        until stdout.eof? do
          pig_out << stdout.gets
          puts pig_out.last
        end
      end

      if $?.success?
        # split(..., -1) keeps trailing empty fields so column counts are stable.
        return *fetch_files_in_hdfs(aliases).map {|lines| lines.map{|e| e.chomp("\n").split("\t", -1)}}
      else
        raise "#{pig_out.join("\n")}Failed executing #{pig_execution}"
      end
    end

    private
    # Copies each dump file out of HDFS via mandy-get, removes it from
    # HDFS, and returns the local files' lines (one array per path).
    def fetch_files_in_hdfs(file_paths)
      mandy_config = @configs[:hadoop_config].nil? ? "" : "-c #{@configs[:hadoop_config]}"
      file_paths.map do |file_path|
        # File.exists? is deprecated; File.exist? is the supported form.
        FileUtils.remove_file(file_path, true) if File.exist?(file_path)
        `mandy-get #{mandy_config} #{file_path} #{file_path}`
        `mandy-rm #{mandy_config} #{file_path}`
        File.open(file_path) {|file| file.readlines}
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,89 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: rbpig
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 1
8
+ - 0
9
+ version: 0.1.0
10
+ platform: ruby
11
+ authors:
12
+ - Jae Lee
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2010-12-22 00:00:00 +00:00
18
+ default_executable:
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: mandy
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
25
+ requirements:
26
+ - - ">="
27
+ - !ruby/object:Gem::Version
28
+ segments:
29
+ - 0
30
+ - 5
31
+ - 0
32
+ version: 0.5.0
33
+ type: :runtime
34
+ version_requirements: *id001
35
+ description: Simple lib for executing Pig queries, supports textfile based hive table loader with automatic schema discovery
36
+ email: jlee@yetitrails.com
37
+ executables:
38
+ - rbpig
39
+ extensions: []
40
+
41
+ extra_rdoc_files: []
42
+
43
+ files:
44
+ - bin/rbpig
45
+ - lib/rbpig.rb
46
+ - lib/rbpig/dataset.rb
47
+ - java/dist/porkchop.jar
48
+ - java/lib/hive/hive-exec-0.5.0+32.jar
49
+ - java/lib/hive/hive-metastore-0.5.0+32.jar
50
+ - java/lib/hive/libfb303.jar
51
+ - java/lib/hive/jdo2-api-2.3-SNAPSHOT.jar
52
+ - java/lib/hive/datanucleus-core-1.1.2-patched.jar
53
+ - java/lib/hive/datanucleus-enhancer-1.1.2.jar
54
+ - java/lib/hive/datanucleus-rdbms-1.1.2.jar
55
+ - java/lib/pig/jsp-2.1-6.1.14.jar
56
+ has_rdoc: true
57
+ homepage: https://github.com/forward/rbpig
58
+ licenses: []
59
+
60
+ post_install_message:
61
+ rdoc_options: []
62
+
63
+ require_paths:
64
+ - lib
65
+ required_ruby_version: !ruby/object:Gem::Requirement
66
+ none: false
67
+ requirements:
68
+ - - ">="
69
+ - !ruby/object:Gem::Version
70
+ segments:
71
+ - 0
72
+ version: "0"
73
+ required_rubygems_version: !ruby/object:Gem::Requirement
74
+ none: false
75
+ requirements:
76
+ - - ">="
77
+ - !ruby/object:Gem::Version
78
+ segments:
79
+ - 0
80
+ version: "0"
81
+ requirements: []
82
+
83
+ rubyforge_project:
84
+ rubygems_version: 1.3.7
85
+ signing_key:
86
+ specification_version: 3
87
+ summary: Pig queries execution ruby binding
88
+ test_files: []
89
+