nosqoop4u 0.0.1-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +2 -0
- data/README +40 -0
- data/bin/nosqoop4u +33 -0
- data/bin/nosqoop4u.rb +220 -0
- metadata +78 -0
data/CHANGELOG
ADDED
data/README
ADDED
@@ -0,0 +1,40 @@
+nosqoop4u
+
+A sqoop-like jruby/jdbc query application that does not run via map/reduce.
+It supports direct output to HDFS and unix filesystems as well as STDOUT.
+
+Requires jruby 1.6+.
+
+- Why write this? What's wrong with sqoop?
+
+Nothing is wrong with sqoop. It just doesn't do quite what I need and
+it's not so straightforward to debug when something goes wrong. In
+addition, my Hadoop cluster does not route beyond an access layer which
+means I need to create individual db-specific firewall rules in order to
+use sqoop.
+
+Also, I wanted a nice opportunity to play with jruby's java interop
+and here I get to use both the JDBC and Hadoop HDFS APIs. On a side
+note, I'm blown away by jruby 1.6.2...stellar performance and an
+elegant seamless integration with java.
+
+# installation
+
+jgem install --no-wrapper nosqoop4u
+
+Or simply copy nosqoop4u.rb and nosqoop4u into your path.
+
+# usage
+
+usage: nosqoop4u options
+-o, --output      # output file (hdfs://, file://, - for stdout)
+-c, --connect url # jdbc connection url (env NS4U_URL)
+-u, --user        # db username (env NS4U_USER)
+-p, --pass        # db password (env NS4U_PASS)
+-e, --query       # sql query to run
+-F, --delim       # delimiter (default: ^A)
+-h, --help
+
+Please let me know if you find this software useful!
+
+--frank
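An editorial aside on the usage above: the options map one-to-one onto the NS4U_* environment variables, but as published the option parser in nosqoop4u.rb exits unless -u, -p, and -e are given explicitly (the env fallbacks are applied inside the NoSqoop class, after that check runs). A hypothetical invocation driven from JRuby, where the host, database, credentials, and HDFS path are all placeholders:

    # sketch only: dbhost/mydb, scott/tiger, and the paths are made up
    system('nosqoop4u',
           '-c', 'jdbc:mysql://dbhost/mydb',
           '-u', 'scott', '-p', 'tiger',
           '-e', 'SELECT * FROM widgets',
           '-o', 'hdfs://namenode/data/widgets.txt',
           '-F', "\t")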
data/bin/nosqoop4u
ADDED
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+if [ -z "$HADOOP_HOME" ]; then
+  echo error: HADOOP_HOME is not defined >&2
+  exit 1
+fi
+
+export PATH=/usr/local/jruby/bin:$PATH
+if ! type -p jruby >&/dev/null; then
+  echo error: cannot find jruby...please set your PATH >&2
+  exit 1
+fi
+export JRUBY_OPTS="--1.9 --fast --server"
+
+NS4U=${0%/*}/nosqoop4u.rb
+if [ ! -f "$NS4U" ]; then
+  echo error: cannot find nosqoop4u.rb...please install it alongside: >&2
+  echo "  $0" >&2
+  exit 2
+fi
+
+# bring in the hadoop/jdbc jars
+for f in "$HADOOP_HOME"/hadoop-core-*.jar; do
+  CLASSPATH=${CLASSPATH}:$f
+done
+
+for f in "$HADOOP_HOME"/lib/*.jar; do
+  CLASSPATH=${CLASSPATH}:$f
+done
+
+export CLASSPATH
+
+exec jruby "$NS4U" "$@"
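The wrapper's only real work is assembling CLASSPATH from the Hadoop jars before exec'ing jruby. The same effect can be had from inside JRuby itself, since require accepts a path to a jar file. A minimal sketch, assuming HADOOP_HOME points at the same Hadoop 0.20-era layout the wrapper expects:

    require 'java'

    hadoop = ENV['HADOOP_HOME'] or abort 'error: HADOOP_HOME is not defined'
    # put hadoop-core plus its bundled dependencies on the JVM classpath
    (Dir["#{hadoop}/hadoop-core-*.jar"] + Dir["#{hadoop}/lib/*.jar"]).each do |jar|
      require jar
    end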
data/bin/nosqoop4u.rb
ADDED
@@ -0,0 +1,220 @@
+#!/usr/bin/env jruby
+
+require 'java'
+require 'rubygems'
+require 'getoptlong'
+
+def usage
+  puts <<-EOF
+usage: nosqoop4u options
+-o, --output      # output file (hdfs://, file://, - for stdout)
+-c, --connect url # jdbc connection url (env NS4U_URL)
+-u, --user        # db username (env NS4U_USER)
+-p, --pass        # db password (env NS4U_PASS)
+-e, --query       # sql query to run
+-F, --delim       # delimiter (default: ^A)
+-h, --help
+  EOF
+end
+
+class NoSqoop
+  def initialize cfg
+    @db_user = cfg[:db_user] || ENV['NS4U_USER']
+    @db_pass = cfg[:db_pass] || ENV['NS4U_PASS']
+    @db_url  = cfg[:db_url]  || ENV['NS4U_URL']
+    @db_host = cfg[:db_host] || ENV['NS4U_HOST']
+    @db_name = cfg[:db_name] || ENV['NS4U_DB']
+
+    load_driver
+    connect
+    hack_jdbc
+  end
+
+  def hack_jdbc
+    case @db_url
+    when /jdbc:mysql:/
+      # by default, the mysql jdbc driver will read the entire table
+      # into memory ... this will change to only one row at a time
+      @stmt = @conn.create_statement java.sql.ResultSet.TYPE_FORWARD_ONLY,
+                                     java.sql.ResultSet.CONCUR_READ_ONLY
+      @stmt.fetch_size = java.lang.Integer.const_get 'MIN_VALUE'
+    when /jdbc:postgresql:/
+      @conn.set_auto_commit false
+      @stmt = @conn.create_statement
+      @stmt.fetch_size = 50
+    else
+      @stmt = @conn.create_statement
+    end
+  end
+
+  def load_driver
+    case @db_url
+    when /jdbc:mysql:/
+      Java::com.mysql.jdbc.Driver
+      # handle 0000-00-00 timestamps without an exception, lulz
+      #@db_url << '?zeroDateTimeBehavior=round' if @db_url !~
+      @db_url << '?zeroDateTimeBehavior=convertToNull' if @db_url !~
+        /zeroDateTimeBehavior/
+    when /jdbc:oracle:/
+      Java::oracle.jdbc.OracleDriver
+    when /jdbc:postgresql:/
+      Java::org.postgresql.Driver
+    else
+      raise "error: unknown database type"
+    end
+  end
+
+  def connect
+    @conn = java.sql.DriverManager.get_connection(@db_url, @db_user, @db_pass)
+  end
+
+  def table_info r
+    meta = r.meta_data
+    cols = meta.column_count
+    colnames = []
+    cols.times do |i|
+      colnames[i] = meta.column_name(i+1)
+    end
+
+    {:cols => cols, :colnames => colnames}
+  end
+  private :table_info
+
+  def query_jdbc sql, opts = {}
+    delim = opts[:delim] || "\001"
+
+    res = @stmt.execute_query sql
+    tbl = table_info res
+
+    while res.next do
+      s = ''
+      1.upto(tbl[:cols]) do |i|
+        data = res.get_string i
+        s << delim if i > 1
+        s << data if data
+      end
+      yield s
+    end
+  end
+
+  def query_cmd sql, opts = {}
+    cmd = %Q|PGPASSWORD=#{@db_pass} psql -t -A -F "#{opts[:delim] || "\001"}" -c '#{sql}' | +
+      %Q|-h #{@db_host} -U #{@db_user} #{@db_name}|
+    p cmd
+    STDOUT.sync = true
+    IO.popen(cmd).each_line {|line| yield line}
+  end
+
+  def query sql, opts = {}
+    output = opts[:output] || STDOUT
+    recs = 0
+    bytes = 0
+
+    if opts[:query_type].to_s == 'cmd'
+      q = method :query_cmd
+    else
+      q = method :query_jdbc
+    end
+
+    begin_ts = Time.now
+
+    q.call(sql, opts) do |s|
+      output.puts s
+      bytes += s.length
+      recs += 1
+      if recs % 100000 == 0
+        end_ts = Time.now
+        mb_out = bytes / 1024 / 1024
+        elapsed = end_ts - begin_ts
+        elapsed = 1 if elapsed < 1
+        rate = mb_out / elapsed.to_f
+        rate_r = recs / elapsed
+        puts "#{recs} records (%.02f recs/s), #{mb_out}MB (%.02f MB/s)" %
+          [rate_r, rate]
+      end
+    end
+
+    end_ts = Time.now
+    mb_out = bytes / 1024 / 1024
+    elapsed = end_ts - begin_ts
+    elapsed = 1 if elapsed < 1
+    rate = mb_out / elapsed.to_f
+    rate_r = recs / elapsed
+    puts
+    puts "= total time: #{elapsed} seconds"
+    puts "= records:    #{recs} records %.02f recs/s" % rate_r
+    puts "= data size:  #{mb_out}MB (%.02f MB/s)" % rate
+  end
+end
+
+def hdfs_open_write filename
+  c = org.apache.hadoop.conf.Configuration.new
+  u = java.net.URI.create filename
+  p = org.apache.hadoop.fs.Path.new u
+  f = org.apache.hadoop.fs.FileSystem.get u, c
+
+  o = f.create p
+
+  def o.puts s
+    s = "#{s}\n" if s.to_s[-1].chr != "\n"
+    self.write_bytes s
+  end
+
+  return o if not block_given?
+
+  yield o
+  o.close
+end
+
+# main
+
+opts = {}
+output = '-'
+sql = nil
+
+gopts = GetoptLong.new(
+  [ '--output',  '-o', GetoptLong::REQUIRED_ARGUMENT ],
+  [ '--connect', '-c', GetoptLong::REQUIRED_ARGUMENT ],
+  [ '--user',    '-u', GetoptLong::REQUIRED_ARGUMENT ],
+  [ '--pass',    '-p', GetoptLong::REQUIRED_ARGUMENT ],
+  [ '--query',   '-e', GetoptLong::REQUIRED_ARGUMENT ],
+  [ '--delim',   '-F', GetoptLong::REQUIRED_ARGUMENT ],
+  [ '--help',    '-h', GetoptLong::NO_ARGUMENT ]
+)
+
+gopts.each do |opt, arg|
+  case opt
+  when '--output'
+    output = arg
+  when '--connect'
+    opts[:db_url] = arg
+  when '--user'
+    opts[:db_user] = arg
+  when '--pass'
+    opts[:db_pass] = arg
+  when '--delim'
+    opts[:delim] = arg
+  when '--query'
+    sql = arg
+  when '--help'
+    usage
+    exit
+  end
+end
+
+if opts[:db_user].nil? or opts[:db_pass].nil? or sql.nil?
+  usage
+  exit 1
+end
+
+ns = NoSqoop.new opts
+
+case output
+when '-' # STDOUT
+  ns.query sql, opts
+when /^hdfs:/
+  hdfs_open_write(output) {|f| opts[:output] = f ; ns.query sql, opts}
+else # unix file path with or without leading file://
+  output.sub!(%r|^file://|, '')
+  File.open(output, 'w') {|f| opts[:output] = f ; ns.query sql, opts}
+end
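Taken together, NoSqoop and hdfs_open_write can also be driven from another JRuby script rather than through the option parser (which runs unconditionally when nosqoop4u.rb is loaded, so the definitions would need to be extracted or guarded first). A minimal sketch of that direct use; the URL, credentials, table, and HDFS path below are all hypothetical:

    # assumes NoSqoop and hdfs_open_write are already loaded; dbhost/mydb,
    # scott/tiger, and the namenode path are placeholders
    ns = NoSqoop.new :db_url  => 'jdbc:mysql://dbhost/mydb',
                     :db_user => 'scott', :db_pass => 'tiger'

    # stream a tab-delimited extract straight into HDFS
    hdfs_open_write 'hdfs://namenode/data/widgets.txt' do |f|
      ns.query 'SELECT * FROM widgets', :output => f, :delim => "\t"
    end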
metadata
ADDED
@@ -0,0 +1,78 @@
+--- !ruby/object:Gem::Specification
+name: nosqoop4u
+version: !ruby/object:Gem::Version
+  prerelease: false
+  segments:
+  - 0
+  - 0
+  - 1
+  version: 0.0.1
+platform: java
+authors:
+- Frank Fejes
+autorequire:
+bindir: bin
+cert_chain: []
+
+date: 2011-07-02 00:00:00 -05:00
+default_executable:
+dependencies: []
+
+description: |
+  A sqoop-like jruby/jdbc query application that does not run via map/reduce.
+  It supports direct output to HDFS and unix filesystems as well as STDOUT.
+  Requires jruby 1.6+.
+
+email: frank@fejes.net
+executables:
+- nosqoop4u
+- nosqoop4u.rb
+extensions: []
+
+extra_rdoc_files: []
+
+files:
+- README
+- CHANGELOG
+- bin/nosqoop4u
+- bin/nosqoop4u.rb
+has_rdoc: true
+homepage: https://github.com/fsfiii/nosqoop4u
+licenses: []
+
+post_install_message: |
+  ===
+  Please be sure to install with:
+
+    jgem install --no-wrapper nosqoop4u
+  ===
+
+rdoc_options: []
+
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      segments:
+      - 0
+      version: "0"
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      segments:
+      - 0
+      version: "0"
+requirements: []
+
+rubyforge_project: nosqoop4u
+rubygems_version: 1.3.7
+signing_key:
+specification_version: 3
+summary: A sqoop-like jruby/jdbc query app that does not run via map/reduce.
+test_files: []
+