nosqoop4u 0.0.1-java
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +2 -0
- data/README +40 -0
- data/bin/nosqoop4u +33 -0
- data/bin/nosqoop4u.rb +220 -0
- metadata +78 -0
data/CHANGELOG
ADDED
data/README
ADDED
@@ -0,0 +1,40 @@
|
|
1
|
+
nosqoop4u
|
2
|
+
|
3
|
+
A sqoop-like jruby/jdbc query application that does not run via map/reduce.
|
4
|
+
It supports direct output to HDFS and unix filesystems as well as STDOUT.
|
5
|
+
|
6
|
+
Requires jruby 1.6+.
|
7
|
+
|
8
|
+
- Why write this? What's wrong with sqoop?
|
9
|
+
|
10
|
+
Nothing is wrong with sqoop. It just doesn't do quite what I need and
|
11
|
+
it's not so straightforward to debug when something goes wrong. In
|
12
|
+
addition, my Hadoop cluster does not route beyond an access layer which
|
13
|
+
means I need to create individual db-specific firewall rules in order to
|
14
|
+
use sqoop.
|
15
|
+
|
16
|
+
Also, I wanted a nice opportunity to play with jruby's java interop
|
17
|
+
and here I get to use both the JDBC and Hadoop HDFS APIs. On a side
|
18
|
+
note, I'm blown away by jruby 1.6.2...stellar performance and an
|
19
|
+
elegant seamless integration with java.
|
20
|
+
|
21
|
+
# installation
|
22
|
+
|
23
|
+
jgem install --no-wrapper nosqoop4u
|
24
|
+
|
25
|
+
Or simply copy nosqoop4u.rb and nosqoop4u into your path.
|
26
|
+
|
27
|
+
# usage
|
28
|
+
|
29
|
+
usage: nosqoop4u options
|
30
|
+
-o, --output # output file (hdfs://, file://, - for stdout)
|
31
|
+
-c, --connect url # jdbc connection url (env NS4U_URL)
|
32
|
+
-u, --user # db username (env NS4U_USER)
|
33
|
+
-p, --pass # db password (env NS4U_PASS)
|
34
|
+
-e, --query # sql query to run
|
35
|
+
-F, --delim # delimiter (default: ^A)
|
36
|
+
-h, --help
|
37
|
+
|
38
|
+
Please let me know if you find this software useful!
|
39
|
+
|
40
|
+
--frank
|
data/bin/nosqoop4u
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
#!/bin/bash
# Launcher for nosqoop4u: locates jruby, builds a CLASSPATH from the
# Hadoop distribution, and execs the real ruby program alongside this script.

if [ -z "$HADOOP_HOME" ]; then
  echo error: HADOOP_HOME is not defined >&2
  exit 1
fi

export PATH=/usr/local/jruby/bin:$PATH
if ! type -p jruby >&/dev/null; then
  echo error: cannot find jruby...please set your PATH >&2
  exit 1
fi
export JRUBY_OPTS="--1.9 --fast --server"

# the .rb payload must live in the same directory as this wrapper
NS4U=${0%/*}/nosqoop4u.rb
if [ ! -f "$NS4U" ]; then
  echo error: cannot find nosqoop4u.rb...please install it alongside: >&2
  echo "  $0" >&2
  exit 2
fi

# bring in the hadoop/jdbc jars (quote each jar path: it may contain spaces)
for f in "$HADOOP_HOME"/hadoop-core-*.jar; do
  CLASSPATH=${CLASSPATH}:"$f"
done

for f in "$HADOOP_HOME"/lib/*.jar; do
  CLASSPATH=${CLASSPATH}:"$f"
done

export CLASSPATH

exec jruby "$NS4U" "$@"
|
data/bin/nosqoop4u.rb
ADDED
@@ -0,0 +1,220 @@
|
|
1
|
+
#!/usr/bin/env jruby
|
2
|
+
|
3
|
+
require 'java'
|
4
|
+
require 'rubygems'
|
5
|
+
require 'getoptlong'
|
6
|
+
|
7
|
+
# Write the command-line help text for nosqoop4u to standard output.
def usage
  help_text = <<-EOF
usage: nosqoop4u options
  -o, --output      # output file (hdfs://, file://, - for stdout)
  -c, --connect url # jdbc connection url (env NS4U_URL)
  -u, --user        # db username (env NS4U_USER)
  -p, --pass        # db password (env NS4U_PASS)
  -e, --query       # sql query to run
  -F, --delim       # delimiter (default: ^A)
  -h, --help
  EOF
  puts help_text
end
|
19
|
+
|
20
|
+
# NoSqoop streams the results of a SQL query as delimited text, sqoop-style,
# but over a plain JDBC connection (or an external psql command) with no
# map/reduce involved.  Requires JRuby for the java.sql interop.
class NoSqoop
  # cfg keys :db_user, :db_pass, :db_url, :db_host, :db_name each fall back
  # to the NS4U_USER / NS4U_PASS / NS4U_URL / NS4U_HOST / NS4U_DB env vars.
  def initialize cfg
    @db_user = cfg[:db_user] || ENV['NS4U_USER']
    @db_pass = cfg[:db_pass] || ENV['NS4U_PASS']
    @db_url  = cfg[:db_url]  || ENV['NS4U_URL']
    @db_host = cfg[:db_host] || ENV['NS4U_HOST']
    @db_name = cfg[:db_name] || ENV['NS4U_DB']

    load_driver
    connect
    hack_jdbc
  end

  # Create the JDBC statement with driver-specific tweaks so that large
  # result sets stream row-by-row instead of buffering in memory.
  def hack_jdbc
    case @db_url
    when /jdbc:mysql:/
      # by default, the mysql jdbc driver will read the entire table
      # into memory ... this will change to only one row at a time
      @stmt = @conn.create_statement java.sql.ResultSet.TYPE_FORWARD_ONLY,
                                     java.sql.ResultSet.CONCUR_READ_ONLY
      # Integer.MIN_VALUE is the mysql driver's magic "stream rows" value
      @stmt.fetch_size = java.lang.Integer.const_get 'MIN_VALUE'
    when /jdbc:postgresql:/
      # postgres only honors fetch_size when autocommit is disabled
      @conn.set_auto_commit false
      @stmt = @conn.create_statement
      @stmt.fetch_size = 50
    else
      @stmt = @conn.create_statement
    end
  end

  # Load the JDBC driver class implied by the connection url; raises for
  # unrecognized url schemes.
  def load_driver
    case @db_url
    when /jdbc:mysql:/
      Java::com.mysql.jdbc.Driver
      # handle 0000-00-00 timestamps without an exception, lulz
      @db_url << '?zeroDateTimeBehavior=convertToNull' if @db_url !~
        /zeroDateTimeBehavior/
    when /jdbc:oracle:/
      Java::oracle.jdbc.OracleDriver
    when /jdbc:postgresql:/
      Java::org.postgresql.Driver
    else
      raise "error: unknown database type"
    end
  end

  # Open the JDBC connection using the configured url and credentials.
  def connect
    @conn = java.sql.DriverManager.get_connection(@db_url, @db_user, @db_pass)
  end

  # Extract column count and column names from a ResultSet's metadata.
  def table_info r
    meta = r.meta_data
    cols = meta.column_count
    colnames = []
    cols.times do |i|
      colnames[i] = meta.column_name(i+1)   # JDBC columns are 1-based
    end

    {:cols => cols, :colnames => colnames}
  end
  private :table_info

  # Run sql over JDBC and yield each row as a single delimited string.
  # opts[:delim] defaults to ^A (\001).
  def query_jdbc sql, opts = {}
    delim = opts[:delim] || "\001"

    res = @stmt.execute_query sql
    tbl = table_info res

    while res.next do
      s = ''
      1.upto(tbl[:cols]) do |i|
        data = res.get_string i
        # delimiter goes BETWEEN fields -- the loop is 1-based, so the
        # original `i > 0` test wrongly prefixed every row with a delimiter
        s << delim if i > 1
        s << data if data   # NULL columns render as empty fields
      end
      yield s
    end
  end

  # Run sql by shelling out to psql, yielding each output line.
  # Only meaningful for postgres connections.
  def query_cmd sql, opts = {}
    # use the caller's delimiter (an undefined @delim was used before,
    # which always produced -F "")
    delim = opts[:delim] || "\001"
    cmd = %Q|PGPASSWORD=#{@db_pass} psql -t -A -F "#{delim}" -c '#{sql}' | +
          %Q|-h #{@db_host} -U #{@db_user} #{@db_name}|
    # note: no debug echo of cmd here -- it would leak the password and
    # corrupt the data stream when output goes to stdout
    STDOUT.sync = true
    IO.popen(cmd).each_line {|line| yield line}
  end

  # Execute sql and write delimited rows to opts[:output] (default STDOUT),
  # printing progress every 100k records and a summary at the end.
  # opts[:query_type] == 'cmd' selects the psql backend over JDBC.
  def query sql, opts = {}
    output = opts[:output] || STDOUT
    recs = 0
    bytes = 0

    if opts[:query_type].to_s == 'cmd'
      q = method :query_cmd
    else
      q = method :query_jdbc
    end

    begin_ts = Time.now

    q.call(sql, opts) do |s|
      output.puts s
      bytes += s.length
      recs += 1
      if recs % 100000 == 0
        end_ts = Time.now
        mb_out = bytes / 1024 / 1024
        elapsed = end_ts - begin_ts
        elapsed = 1 if elapsed < 1
        rate = mb_out / elapsed.to_f
        # float division: integer division truncated the record rate before
        rate_r = recs / elapsed.to_f
        puts "#{recs} records (%.02f recs/s), #{mb_out}MB (%.02f MB/s)" %
          [rate_r, rate]
      end
    end

    end_ts = Time.now
    mb_out = bytes / 1024 / 1024
    elapsed = end_ts - begin_ts
    elapsed = 1 if elapsed < 1
    rate = mb_out / elapsed.to_f
    rate_r = recs / elapsed.to_f
    puts
    puts "= total time: #{elapsed} seconds"
    puts "= records: #{recs} records %.02f recs/s" % rate_r
    puts "= data size: #{mb_out}MB (%.02f MB/s)" % rate
  end
end
|
149
|
+
|
150
|
+
# Open filename (an hdfs:// URI) for writing via the Hadoop FileSystem API.
# With a block: yields the open stream and ALWAYS closes it, even if the
# block raises.  Without a block: returns the stream; caller must close it.
def hdfs_open_write filename
  c = org.apache.hadoop.conf.Configuration.new
  u = java.net.URI.create filename
  p = org.apache.hadoop.fs.Path.new u
  f = org.apache.hadoop.fs.FileSystem.get u, c

  o = f.create p

  # graft a ruby-style puts onto the hadoop output stream
  def o.puts s
    s = s.to_s
    # append a newline unless one is already present; end_with? is safe for
    # the empty string (the old s[-1].chr raised NoMethodError on "")
    s = "#{s}\n" unless s.end_with? "\n"
    self.write_bytes s
  end

  return o if not block_given?

  begin
    yield o
  ensure
    o.close   # guarantee the HDFS stream is released on error paths too
  end
end
|
168
|
+
|
169
|
+
# main -- parse command-line options, validate, and run the export.

opts = {}
output = '-'
sql = nil

gopts = GetoptLong.new(
  [ '--output',  '-o', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--connect', '-c', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--user',    '-u', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--pass',    '-p', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--query',   '-e', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--delim',   '-F', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--help',    '-h', GetoptLong::NO_ARGUMENT ]
)

gopts.each do |opt, arg|
  case opt
  when '--output'
    output = arg
  when '--connect'
    opts[:db_url] = arg
  when '--user'
    opts[:db_user] = arg
  when '--pass'
    opts[:db_pass] = arg
  when '--delim'
    opts[:delim] = arg
  when '--query'
    sql = arg
  when '--help'
    usage
    exit
  end
end

# validate AFTER applying the same NS4U_* env fallbacks NoSqoop itself uses;
# the old check looked only at the flags, so env-configured runs were
# wrongly rejected, and a missing url surfaced as a confusing driver error
db_url  = opts[:db_url]  || ENV['NS4U_URL']
db_user = opts[:db_user] || ENV['NS4U_USER']
db_pass = opts[:db_pass] || ENV['NS4U_PASS']

if db_url.nil? or db_user.nil? or db_pass.nil? or sql.nil?
  usage
  exit 1
end

ns = NoSqoop.new opts

case output
when '-' # STDOUT
  ns.query sql, opts
when /^hdfs:/
  hdfs_open_write(output) {|f| opts[:output] = f ; ns.query sql, opts}
else # unix file path with or without leading file://
  output.sub!(%r|^file://|, '')
  File.open(output, 'w') {|f| opts[:output] = f ; ns.query sql, opts}
end
|
metadata
ADDED
@@ -0,0 +1,78 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: nosqoop4u
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
prerelease: false
|
5
|
+
segments:
|
6
|
+
- 0
|
7
|
+
- 0
|
8
|
+
- 1
|
9
|
+
version: 0.0.1
|
10
|
+
platform: java
|
11
|
+
authors:
|
12
|
+
- Frank Fejes
|
13
|
+
autorequire:
|
14
|
+
bindir: bin
|
15
|
+
cert_chain: []
|
16
|
+
|
17
|
+
date: 2011-07-02 00:00:00 -05:00
|
18
|
+
default_executable:
|
19
|
+
dependencies: []
|
20
|
+
|
21
|
+
description: |
|
22
|
+
A sqoop-like jruby/jdbc query application that does not run via map/reduce.
|
23
|
+
It supports direct output to HDFS and unix filesystems as well as STDOUT.
|
24
|
+
Requires jruby 1.6+.
|
25
|
+
|
26
|
+
email: frank@fejes.net
|
27
|
+
executables:
|
28
|
+
- nosqoop4u
|
29
|
+
- nosqoop4u.rb
|
30
|
+
extensions: []
|
31
|
+
|
32
|
+
extra_rdoc_files: []
|
33
|
+
|
34
|
+
files:
|
35
|
+
- README
|
36
|
+
- CHANGELOG
|
37
|
+
- bin/nosqoop4u
|
38
|
+
- bin/nosqoop4u.rb
|
39
|
+
has_rdoc: true
|
40
|
+
homepage: https://github.com/fsfiii/nosqoop4u
|
41
|
+
licenses: []
|
42
|
+
|
43
|
+
post_install_message: |
|
44
|
+
===
|
45
|
+
Please be sure to install with:
|
46
|
+
|
47
|
+
jgem install --no-wrapper nosqoop4u
|
48
|
+
===
|
49
|
+
|
50
|
+
rdoc_options: []
|
51
|
+
|
52
|
+
require_paths:
|
53
|
+
- lib
|
54
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
55
|
+
none: false
|
56
|
+
requirements:
|
57
|
+
- - ">="
|
58
|
+
- !ruby/object:Gem::Version
|
59
|
+
segments:
|
60
|
+
- 0
|
61
|
+
version: "0"
|
62
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
63
|
+
none: false
|
64
|
+
requirements:
|
65
|
+
- - ">="
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
segments:
|
68
|
+
- 0
|
69
|
+
version: "0"
|
70
|
+
requirements: []
|
71
|
+
|
72
|
+
rubyforge_project: nosqoop4u
|
73
|
+
rubygems_version: 1.3.7
|
74
|
+
signing_key:
|
75
|
+
specification_version: 3
|
76
|
+
summary: A sqoop-like jruby/jdbc query app that does not run via map/reduce.
|
77
|
+
test_files: []
|
78
|
+
|