rbpig 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/rbpig +26 -0
- data/java/dist/porkchop.jar +0 -0
- data/java/lib/hive/datanucleus-core-1.1.2-patched.jar +0 -0
- data/java/lib/hive/datanucleus-enhancer-1.1.2.jar +0 -0
- data/java/lib/hive/datanucleus-rdbms-1.1.2.jar +0 -0
- data/java/lib/hive/hive-exec-0.5.0+32.jar +0 -0
- data/java/lib/hive/hive-metastore-0.5.0+32.jar +0 -0
- data/java/lib/hive/jdo2-api-2.3-SNAPSHOT.jar +0 -0
- data/java/lib/hive/libfb303.jar +0 -0
- data/java/lib/pig/jsp-2.1-6.1.14.jar +0 -0
- data/lib/rbpig/dataset.rb +23 -0
- data/lib/rbpig.rb +106 -0
- metadata +89 -0
data/bin/rbpig
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
#!/usr/bin/env ruby
require 'rubygems'
require 'optparse'
require 'ostruct'
require 'rbpig'

# Command-line launcher: parses cluster/metastore config flags and replaces
# this process with a pig shell assembled by RBPig.executable.
cli_options = OpenStruct.new

OptionParser.new do |parser|
  parser.banner = "USAGE: rbpig [options]"

  parser.on("-c", "--conf HADOOP_CONF", "Use this hadoop cluster xml config file.") do |path|
    cli_options.hadoop_config = path
  end

  # NOTE(review): "--mata-conf" looks like a typo for "--meta-conf", but it is
  # the published flag name; preserved for backward compatibility.
  parser.on("-m", "--mata-conf HIVE_CONF", "Use this hive metastore xml config file.") do |path|
    cli_options.hive_config = path
  end

  parser.on_tail("-h", "--help", "Show this message") do
    puts parser
    exit
  end
end.parse!

# exec never returns: the launcher becomes the pig process.
exec(RBPig.executable(:hadoop_config => cli_options.hadoop_config, :hive_config => cli_options.hive_config))
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module RBPig
  # Represents a single Pig LOAD statement for a dataset, currently backed
  # by a text-file Hive table read through the porkchop HiveTableLoader.
  # Instances are immutable; +to_s+ yields the Pig Latin statement.
  class Dataset
    class << self
      # Default Hive metastore settings. Frozen so callers cannot mutate the
      # shared defaults; per-call overrides go through the +config+ hash.
      HIVE_CONFIG = {:database => "default", :database_root => "/user/hive/warehouse"}.freeze

      # Builds a Dataset that loads +table_name+ from the Hive warehouse.
      #
      # table_name   - Hive table name; also used as the Pig alias.
      # storage_type - only :text_file is supported.
      # config       - optional overrides for :database, :database_root and
      #                :field_separator (defaults to the literal "\t" escape
      #                that pig expects in its script text).
      #
      # Raises RuntimeError for any storage type other than :text_file.
      def hive(table_name, storage_type = :text_file, config = {})
        raise "storage type other than :text_file is not supported." unless storage_type == :text_file

        config = HIVE_CONFIG.merge(:field_separator => "\\t").merge(config)
        new("#{table_name} = LOAD '#{config[:database_root]}/#{table_name}' USING HiveTableLoader('#{config[:field_separator]}', '#{config[:database]}');")
      end
    end

    # The Pig LOAD statement this dataset contributes to a script.
    def to_s
      @load_script
    end

    private
    # Instances are created only through the class-level factory methods.
    def initialize(load_script)
      @load_script = load_script
    end
  end
end
|
data/lib/rbpig.rb
ADDED
@@ -0,0 +1,106 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
require 'rexml/document'
|
3
|
+
require File.join(File.dirname(__FILE__), "rbpig", "dataset")
|
4
|
+
|
5
|
+
module RBPig
  class << self
    # Builds the shell command line that launches pig with the bundled
    # porkchop classpath, forwarding namenode/jobtracker settings parsed
    # from the optional hadoop config file.
    #
    # configs - hash (or nil) with optional :hadoop_config (path to a hadoop
    #           xml config) and :hive_config (path to hive-site.xml).
    #
    # Returns the command string, e.g. "PIG_CLASSPATH='...' PIG_OPTS='...' pig -l /tmp".
    def executable(configs)
      configs = pig_configs(configs)

      pig_options = ["-Dudf.import.list=forward.pig.storage"]
      unless configs[:hadoop_config].nil?
        hadoop_config = {}
        # Each <property> element holds a name/value pair; assumes the child
        # order (name first, value second) that hadoop itself writes.
        # Block form of File.open ensures the handle is closed (the previous
        # File.new leaked it).
        document = File.open(configs[:hadoop_config]) { |file| REXML::Document.new(file) }
        document.elements.each('configuration/property') do |property|
          hadoop_config[property.elements[1].text] = property.elements[2].text
        end
        pig_options << "-Dfs.default.name=#{hadoop_config["fs.default.name"]}" if hadoop_config.has_key?("fs.default.name")
        pig_options << "-Dmapred.job.tracker=#{hadoop_config["mapred.job.tracker"]}" if hadoop_config.has_key?("mapred.job.tracker")
      end
      ["PIG_CLASSPATH='#{pig_classpath(configs)}'", "PIG_OPTS='#{pig_options.join(" ")}'", "pig", "-l /tmp"].join(" ")
    end

    # Yields a Pig session for queueing datasets and statements.
    def connect(configs)
      yield Pig.new(pig_configs(configs))
    end

    private
    # Normalizes a possibly-nil configs hash, defaulting both keys to nil.
    def pig_configs(configs)
      {:hadoop_config => nil, :hive_config => nil}.merge(configs || {})
    end

    # Colon-joined classpath of the jars shipped with the gem, plus the
    # directory holding hive-site.xml when a hive config is supplied.
    def pig_classpath(configs)
      bundled_jars = [
        %w[.. java dist porkchop.jar],
        %w[.. java lib hive hive-exec-0.5.0+32.jar],
        %w[.. java lib hive hive-metastore-0.5.0+32.jar],
        %w[.. java lib hive libfb303.jar],
        %w[.. java lib hive jdo2-api-2.3-SNAPSHOT.jar],
        %w[.. java lib hive datanucleus-core-1.1.2-patched.jar],
        %w[.. java lib hive datanucleus-enhancer-1.1.2.jar],
        %w[.. java lib hive datanucleus-rdbms-1.1.2.jar],
        %w[.. java lib pig jsp-2.1-6.1.14.jar]
      ]
      classpath = bundled_jars.map { |parts| File.join(File.dirname(__FILE__), parts) }
      unless configs[:hive_config].nil?
        # Hive discovers its metastore config from the classpath by file
        # name, so the file must be called exactly hive-site.xml.
        raise "Rename '#{configs[:hive_config]}' to hive-site.xml for hive metastore configuration." unless File.basename(configs[:hive_config]) == "hive-site.xml"
        classpath << File.dirname(configs[:hive_config])
      end
      classpath.join(":").freeze
    end
  end

  # A queued Pig session: collects LOAD statements and raw Pig Latin, then
  # runs everything in a single pig invocation when +fetch+ is called.
  class Pig
    def initialize(configs)
      @configs = configs
      @oink_oink = []  # ordered Pig Latin statements for the generated script
    end

    # Appends the LOAD statements of the given Dataset objects.
    def datasets(*datasets)
      datasets.each {|e| @oink_oink << e.to_s}
    end

    # Appends one raw Pig Latin statement.
    def grunt(oink)
      @oink_oink << oink
    end

    # Runs the accumulated script, STOREs each alias into a per-run HDFS
    # directory, copies the results locally via mandy-get, and returns one
    # array per alias of tab-split rows.
    #
    # Raises RuntimeError (including the captured pig output) when pig
    # exits non-zero.
    def fetch(*aliases)
      # One timestamp for the whole run so the dump dir and the script path
      # can never straddle a second boundary and disagree.
      run_id = "#{Process.pid}_#{Time.now.to_i}"
      alias_dump_dir = "/tmp/pigdump/#{run_id}"
      aliases = aliases.map {|alias_to_fetch| "#{alias_dump_dir}/#{alias_to_fetch}"}

      pig_script_path = "/tmp/pigscript/#{run_id}"
      FileUtils.mkdir_p(File.dirname(pig_script_path))
      File.open(pig_script_path, "w") do |file|
        @oink_oink.each {|oink| file << "#{oink}\n"}
        aliases.each do |dump_file_path|
          file << "STORE #{File.basename(dump_file_path)} INTO '#{dump_file_path}' USING PigStorage ('\\t');\n"
        end
      end

      # 2>&1 folds pig's stderr into the captured stream; output is echoed
      # live so long jobs stay observable.
      pig_execution = "#{RBPig.executable(@configs)} -f #{pig_script_path} 2>&1"
      pig_out = []
      IO.popen(pig_execution) do |stdout|
        puts pig_execution
        until stdout.eof? do
          pig_out << stdout.gets
          puts pig_out.last
        end
      end

      if $?.success?
        # split("\t", -1) keeps trailing empty fields.
        return *fetch_files_in_hdfs(aliases).map {|lines| lines.map{|e| e.chomp("\n").split("\t", -1)}}
      else
        raise "#{pig_out.join("\n")}Failed executing #{pig_execution}"
      end
    end

    private
    # Copies each HDFS file to the identical local path with mandy-get,
    # removes it from HDFS, and returns each local file's lines.
    def fetch_files_in_hdfs(file_paths)
      mandy_config = @configs[:hadoop_config].nil? ? "" : "-c #{@configs[:hadoop_config]}"
      return file_paths.map do |file_path|
        # File.exist? — File.exists? was deprecated and removed in Ruby 3.2.
        FileUtils.remove_file(file_path, true) if File.exist?(file_path)
        `mandy-get #{mandy_config} #{file_path} #{file_path}`
        `mandy-rm #{mandy_config} #{file_path}`
        File.open(file_path) {|file| file.readlines}
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rbpig
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 1
|
8
|
+
- 0
|
9
|
+
version: 0.1.0
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Jae Lee
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2010-12-22 00:00:00 +00:00
|
18
|
+
default_executable:
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: mandy
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
none: false
|
25
|
+
requirements:
|
26
|
+
- - ">="
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
segments:
|
29
|
+
- 0
|
30
|
+
- 5
|
31
|
+
- 0
|
32
|
+
version: 0.5.0
|
33
|
+
type: :runtime
|
34
|
+
version_requirements: *id001
|
35
|
+
description: Simple lib for executing Pig queries, supports textfile based hive table loader with automatic schema discovery
|
36
|
+
email: jlee@yetitrails.com
|
37
|
+
executables:
|
38
|
+
- rbpig
|
39
|
+
extensions: []
|
40
|
+
|
41
|
+
extra_rdoc_files: []
|
42
|
+
|
43
|
+
files:
|
44
|
+
- bin/rbpig
|
45
|
+
- lib/rbpig.rb
|
46
|
+
- lib/rbpig/dataset.rb
|
47
|
+
- java/dist/porkchop.jar
|
48
|
+
- java/lib/hive/hive-exec-0.5.0+32.jar
|
49
|
+
- java/lib/hive/hive-metastore-0.5.0+32.jar
|
50
|
+
- java/lib/hive/libfb303.jar
|
51
|
+
- java/lib/hive/jdo2-api-2.3-SNAPSHOT.jar
|
52
|
+
- java/lib/hive/datanucleus-core-1.1.2-patched.jar
|
53
|
+
- java/lib/hive/datanucleus-enhancer-1.1.2.jar
|
54
|
+
- java/lib/hive/datanucleus-rdbms-1.1.2.jar
|
55
|
+
- java/lib/pig/jsp-2.1-6.1.14.jar
|
56
|
+
has_rdoc: true
|
57
|
+
homepage: https://github.com/forward/rbpig
|
58
|
+
licenses: []
|
59
|
+
|
60
|
+
post_install_message:
|
61
|
+
rdoc_options: []
|
62
|
+
|
63
|
+
require_paths:
|
64
|
+
- lib
|
65
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
66
|
+
none: false
|
67
|
+
requirements:
|
68
|
+
- - ">="
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
segments:
|
71
|
+
- 0
|
72
|
+
version: "0"
|
73
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
74
|
+
none: false
|
75
|
+
requirements:
|
76
|
+
- - ">="
|
77
|
+
- !ruby/object:Gem::Version
|
78
|
+
segments:
|
79
|
+
- 0
|
80
|
+
version: "0"
|
81
|
+
requirements: []
|
82
|
+
|
83
|
+
rubyforge_project:
|
84
|
+
rubygems_version: 1.3.7
|
85
|
+
signing_key:
|
86
|
+
specification_version: 3
|
87
|
+
summary: Pig queries execution ruby binding
|
88
|
+
test_files: []
|
89
|
+
|