rbpig 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/rbpig +26 -0
- data/java/dist/porkchop.jar +0 -0
- data/java/lib/hive/datanucleus-core-1.1.2-patched.jar +0 -0
- data/java/lib/hive/datanucleus-enhancer-1.1.2.jar +0 -0
- data/java/lib/hive/datanucleus-rdbms-1.1.2.jar +0 -0
- data/java/lib/hive/hive-exec-0.5.0+32.jar +0 -0
- data/java/lib/hive/hive-metastore-0.5.0+32.jar +0 -0
- data/java/lib/hive/jdo2-api-2.3-SNAPSHOT.jar +0 -0
- data/java/lib/hive/libfb303.jar +0 -0
- data/java/lib/pig/jsp-2.1-6.1.14.jar +0 -0
- data/lib/rbpig/dataset.rb +23 -0
- data/lib/rbpig.rb +106 -0
- metadata +89 -0
data/bin/rbpig
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
#!/usr/bin/env ruby
# Command-line entry point for rbpig: builds the pig invocation via
# RBPig.executable and replaces this process with it, optionally wiring
# in hadoop cluster and hive metastore XML config files.
require 'rubygems'
require 'optparse'
require 'ostruct'
require 'rbpig'

options = OpenStruct.new

OptionParser.new do |opts|
  opts.banner = "USAGE: rbpig [options]"

  opts.on("-c", "--conf HADOOP_CONF", "Use this hadoop cluster xml config file.") do |config|
    options.hadoop_config = config
  end

  # "--mata-conf" was a typo in the original release; the corrected
  # "--meta-conf" is now primary and the old spelling is kept as a
  # backward-compatible alias so existing invocations keep working.
  opts.on("-m", "--meta-conf HIVE_CONF", "--mata-conf", "Use this hive metastore xml config file.") do |config|
    options.hive_config = config
  end

  opts.on_tail("-h", "--help", "Show this message") do
    puts opts
    exit
  end
end.parse!

# Replace this process with the configured pig shell.
exec(RBPig.executable(:hadoop_config => options.hadoop_config, :hive_config => options.hive_config))
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module RBPig
  # Value object wrapping a single pig latin LOAD statement for a
  # hive-backed table; instances are built via Dataset.hive and render
  # the statement through #to_s.
  class Dataset
    class << self
      HIVE_CONFIG = {:database => "default", :database_root => "/user/hive/warehouse"}

      # Builds a Dataset that LOADs +table_name+ through HiveTableLoader.
      # Only :text_file storage is supported. +config+ may override
      # :database, :database_root and :field_separator (defaults to a
      # literal "\t" escape understood by pig).
      def hive(table_name, storage_type = :text_file, config = {})
        raise "storage type other than :text_file is not supported." unless storage_type == :text_file

        opts = HIVE_CONFIG.merge(:field_separator => "\\t").merge(config)
        table_path = "#{opts[:database_root]}/#{table_name}"
        loader_args = "'#{opts[:field_separator]}', '#{opts[:database]}'"
        new("#{table_name} = LOAD '#{table_path}' USING HiveTableLoader(#{loader_args});")
      end
    end

    # The raw pig latin LOAD statement for this dataset.
    def to_s
      @load_script
    end

    private

    def initialize(load_script)
      @load_script = load_script
    end
  end
end
|
data/lib/rbpig.rb
ADDED
@@ -0,0 +1,106 @@
|
|
1
|
+
require 'fileutils'
|
2
|
+
require 'rexml/document'
|
3
|
+
require File.join(File.dirname(__FILE__), "rbpig", "dataset")
|
4
|
+
|
5
|
+
module RBPig
  class << self
    # Builds the shell command line that launches pig with the bundled
    # porkchop/hive jars on PIG_CLASSPATH and, when a hadoop config file
    # is supplied, the cluster's fs.default.name / mapred.job.tracker
    # forwarded as -D java properties through PIG_OPTS.
    #
    # configs - hash with optional :hadoop_config and :hive_config file
    #           paths; nil is accepted and treated as an empty hash.
    def executable(configs)
      configs = pig_configs(configs)

      pig_options = ["-Dudf.import.list=forward.pig.storage"]
      unless configs[:hadoop_config].nil?
        hadoop_config = parse_hadoop_config(configs[:hadoop_config])
        pig_options << "-Dfs.default.name=#{hadoop_config["fs.default.name"]}" if hadoop_config.has_key?("fs.default.name")
        pig_options << "-Dmapred.job.tracker=#{hadoop_config["mapred.job.tracker"]}" if hadoop_config.has_key?("mapred.job.tracker")
      end
      ["PIG_CLASSPATH='#{pig_classpath(configs)}'", "PIG_OPTS='#{pig_options.join(" ")}'", "pig", "-l /tmp"].join(" ")
    end

    # Yields a Pig session configured with the normalized configs.
    def connect(configs)
      yield Pig.new(pig_configs(configs))
    end

    private

    # Normalizes the user-supplied config hash, tolerating nil.
    def pig_configs(configs)
      {:hadoop_config => nil, :hive_config => nil}.merge(configs || {})
    end

    # Reads <name>/<value> pairs out of a hadoop XML config file.
    # Fixes two defects in the original: File.new leaked the handle
    # (now closed via the File.open block), and properties were read by
    # child position (elements[1]/[2]), which silently mis-parses files
    # where <value> precedes <name>; named lookup is order-independent.
    def parse_hadoop_config(path)
      properties = {}
      File.open(path) do |file|
        REXML::Document.new(file).elements.each('configuration/property') do |property|
          name = property.elements['name']
          value = property.elements['value']
          properties[name.text] = value.text unless name.nil? || value.nil?
        end
      end
      properties
    end

    # Colon-joined classpath of the bundled jars plus, when given, the
    # directory holding the user's hive-site.xml (hive only picks the
    # metastore config up under that exact filename).
    def pig_classpath(configs)
      classpath = [
        %w[.. java dist porkchop.jar],
        %w[.. java lib hive hive-exec-0.5.0+32.jar],
        %w[.. java lib hive hive-metastore-0.5.0+32.jar],
        %w[.. java lib hive libfb303.jar],
        %w[.. java lib hive jdo2-api-2.3-SNAPSHOT.jar],
        %w[.. java lib hive datanucleus-core-1.1.2-patched.jar],
        %w[.. java lib hive datanucleus-enhancer-1.1.2.jar],
        %w[.. java lib hive datanucleus-rdbms-1.1.2.jar],
        %w[.. java lib pig jsp-2.1-6.1.14.jar]
      ].map {|parts| File.join(File.dirname(__FILE__), parts)}
      unless configs[:hive_config].nil?
        raise "Rename '#{configs[:hive_config]}' to hive-site.xml for hive metastore configuration." unless File.basename(configs[:hive_config]) == "hive-site.xml"
        classpath << File.dirname(configs[:hive_config])
      end
      classpath.join(":").freeze
    end
  end
end
|
51
|
+
module RBPig
  # A pig session: accumulates pig latin statements, then executes them
  # all in one external pig invocation, STOREing requested aliases into
  # HDFS and pulling the dump files back locally with mandy-get.
  class Pig
    def initialize(configs)
      @configs = configs
      @oink_oink = []   # accumulated pig latin statements, in order
    end

    # Appends each dataset's LOAD statement (its #to_s) to the session.
    def datasets(*datasets)
      datasets.each {|e| @oink_oink << e.to_s}
    end

    # Appends one raw pig latin statement to the session.
    def grunt(oink)
      @oink_oink << oink
    end

    # Writes the accumulated script plus a STORE per alias to a temp
    # file, runs pig on it (echoing its output), and returns the fetched
    # rows: one array of tab-split line arrays per alias. The splatted
    # return means a single alias yields that array directly rather
    # than wrapped in another array.
    # Raises with the captured pig output when the pig process fails.
    def fetch(*aliases)
      alias_dump_dir = "/tmp/pigdump/#{Process.pid}_#{Time.now.to_i}"
      aliases = aliases.map {|alias_to_fetch| "#{alias_dump_dir}/#{alias_to_fetch}"}

      pig_script_path = "/tmp/pigscript/#{Process.pid}_#{Time.now.to_i}"
      FileUtils.mkdir_p(File.dirname(pig_script_path))
      File.open(pig_script_path, "w") do |file|
        @oink_oink.each {|oink| file << "#{oink}\n"}
        aliases.each do |dump_file_path|
          file << "STORE #{File.basename(dump_file_path)} INTO '#{dump_file_path}' USING PigStorage ('\\t');\n"
        end
      end

      pig_execution = "#{RBPig.executable(@configs)} -f #{pig_script_path} 2>&1"
      pig_out = []
      IO.popen(pig_execution) do |stdout|
        puts pig_execution
        until stdout.eof? do
          pig_out << stdout.gets
          puts pig_out.last
        end
      end

      if $?.success?
        return *fetch_files_in_hdfs(aliases).map {|lines| lines.map{|e| e.chomp("\n").split("\t", -1)}}
      else
        raise "#{pig_out.join("\n")}Failed executing #{pig_execution}"
      end
    end

    private

    # Copies each HDFS dump file to the identical local path via
    # mandy-get, removes the HDFS copy, and returns each local file's
    # lines. NOTE(review): assumes mandy-get/mandy-rm are on PATH.
    def fetch_files_in_hdfs(file_paths)
      mandy_config = @configs[:hadoop_config].nil? && "" || "-c #{@configs[:hadoop_config]}"
      return file_paths.map do |file_path|
        # File.exists? was removed in ruby 3.2; File.exist? is the
        # long-standing equivalent with identical behavior.
        FileUtils.remove_file(file_path, true) if File.exist?(file_path)
        `mandy-get #{mandy_config} #{file_path} #{file_path}`
        `mandy-rm #{mandy_config} #{file_path}`
        File.open(file_path) {|file| file.readlines}
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rbpig
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 1
|
8
|
+
- 0
|
9
|
+
version: 0.1.0
|
10
|
+
platform: ruby
|
11
|
+
authors:
|
12
|
+
- Jae Lee
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2010-12-22 00:00:00 +00:00
|
18
|
+
default_executable:
|
19
|
+
dependencies:
|
20
|
+
- !ruby/object:Gem::Dependency
|
21
|
+
name: mandy
|
22
|
+
prerelease: false
|
23
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
+
none: false
|
25
|
+
requirements:
|
26
|
+
- - ">="
|
27
|
+
- !ruby/object:Gem::Version
|
28
|
+
segments:
|
29
|
+
- 0
|
30
|
+
- 5
|
31
|
+
- 0
|
32
|
+
version: 0.5.0
|
33
|
+
type: :runtime
|
34
|
+
version_requirements: *id001
|
35
|
+
description: Simple lib for executing Pig queries, supports textfile based hive table loader with automatic schema discovery
|
36
|
+
email: jlee@yetitrails.com
|
37
|
+
executables:
|
38
|
+
- rbpig
|
39
|
+
extensions: []
|
40
|
+
|
41
|
+
extra_rdoc_files: []
|
42
|
+
|
43
|
+
files:
|
44
|
+
- bin/rbpig
|
45
|
+
- lib/rbpig.rb
|
46
|
+
- lib/rbpig/dataset.rb
|
47
|
+
- java/dist/porkchop.jar
|
48
|
+
- java/lib/hive/hive-exec-0.5.0+32.jar
|
49
|
+
- java/lib/hive/hive-metastore-0.5.0+32.jar
|
50
|
+
- java/lib/hive/libfb303.jar
|
51
|
+
- java/lib/hive/jdo2-api-2.3-SNAPSHOT.jar
|
52
|
+
- java/lib/hive/datanucleus-core-1.1.2-patched.jar
|
53
|
+
- java/lib/hive/datanucleus-enhancer-1.1.2.jar
|
54
|
+
- java/lib/hive/datanucleus-rdbms-1.1.2.jar
|
55
|
+
- java/lib/pig/jsp-2.1-6.1.14.jar
|
56
|
+
has_rdoc: true
|
57
|
+
homepage: https://github.com/forward/rbpig
|
58
|
+
licenses: []
|
59
|
+
|
60
|
+
post_install_message:
|
61
|
+
rdoc_options: []
|
62
|
+
|
63
|
+
require_paths:
|
64
|
+
- lib
|
65
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
66
|
+
none: false
|
67
|
+
requirements:
|
68
|
+
- - ">="
|
69
|
+
- !ruby/object:Gem::Version
|
70
|
+
segments:
|
71
|
+
- 0
|
72
|
+
version: "0"
|
73
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
74
|
+
none: false
|
75
|
+
requirements:
|
76
|
+
- - ">="
|
77
|
+
- !ruby/object:Gem::Version
|
78
|
+
segments:
|
79
|
+
- 0
|
80
|
+
version: "0"
|
81
|
+
requirements: []
|
82
|
+
|
83
|
+
rubyforge_project:
|
84
|
+
rubygems_version: 1.3.7
|
85
|
+
signing_key:
|
86
|
+
specification_version: 3
|
87
|
+
summary: Pig queries execution ruby binding
|
88
|
+
test_files: []
|
89
|
+
|