nosqoop4u 0.0.1-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (5) hide show
  1. data/CHANGELOG +2 -0
  2. data/README +40 -0
  3. data/bin/nosqoop4u +33 -0
  4. data/bin/nosqoop4u.rb +220 -0
  5. metadata +78 -0
data/CHANGELOG ADDED
@@ -0,0 +1,2 @@
1
+ * 2011-07-02 - fsf
2
+ - initial import
data/README ADDED
@@ -0,0 +1,40 @@
1
+ nosqoop4u
2
+
3
+ A sqoop-like jruby/jdbc query application that does not run via map/reduce.
4
+ It supports direct output to HDFS and unix filesystems as well as STDOUT.
5
+
6
+ Requires jruby 1.6+.
7
+
8
+ - Why write this? What's wrong with sqoop?
9
+
10
+ Nothing is wrong with sqoop. It just doesn't do quite what I need and
11
+ it's not so straightforward to debug when something goes wrong. In
12
+ addition, my Hadoop cluster does not route beyond an access layer which
13
+ means I need to create individual db-specific firewall rules in order to
14
+ use sqoop.
15
+
16
+ Also, I wanted a nice opportunity to play with jruby's java interop
17
+ and here I get to use both the JDBC and Hadoop HDFS APIs. On a side
18
+ note, I'm blown away by jruby 1.6.2...stellar performance and an
19
+ elegant seamless integration with java.
20
+
21
+ # installation
22
+
23
+ jgem install --no-wrapper nosqoop4u
24
+
25
+ Or simply copy nosqoop4u.rb and nosqoop4u side by side into a directory on your PATH.
26
+
27
+ # usage
28
+
29
+ usage: nosqoop4u options
30
+ -o, --output # output file (hdfs://, file://, - for stdout)
31
+ -c, --connect url # jdbc connection url (env NS4U_URL)
32
+ -u, --user # db username (env NS4U_USER)
33
+ -p, --pass # db password (env NS4U_PASS)
34
+ -e, --query # sql query to run
35
+ -F, --delim # delimiter (default: ^A)
36
+ -h, --help
37
+
38
+ Please let me know if you find this software useful!
39
+
40
+ --frank
data/bin/nosqoop4u ADDED
@@ -0,0 +1,33 @@
1
#!/bin/bash
#
# Launcher for nosqoop4u.rb.
#
# Verifies that HADOOP_HOME is set and that jruby is reachable, puts the
# Hadoop core jar and everything under $HADOOP_HOME/lib on the CLASSPATH,
# then execs jruby on the nosqoop4u.rb that lives next to this script.
# All command-line arguments are passed through untouched.

if [ -z "$HADOOP_HOME" ]; then
    echo "error: HADOOP_HOME is not defined" >&2
    exit 1
fi

export PATH=/usr/local/jruby/bin:$PATH
if ! type -p jruby >/dev/null 2>&1; then
    echo "error: cannot find jruby...please set your PATH" >&2
    exit 1
fi
export JRUBY_OPTS="--1.9 --fast --server"

# dirname copes with a $0 that contains no slash (e.g. "bash nosqoop4u"),
# where ${0%/*} would have returned the script name itself.
NS4U="$(dirname "$0")/nosqoop4u.rb"
if [ ! -f "$NS4U" ]; then
    echo "error: cannot find nosqoop4u.rb...please install it alongside:" >&2
    echo " $0" >&2
    exit 2
fi

# bring in the hadoop/jdbc jars
for f in "$HADOOP_HOME"/hadoop-core-*.jar "$HADOOP_HOME"/lib/*.jar; do
    # an unmatched glob is left as the literal pattern; skip it
    [ -e "$f" ] || continue
    # only insert the ':' separator when CLASSPATH already has content, so
    # an empty entry (which the JVM reads as the current directory) is
    # never introduced
    CLASSPATH="${CLASSPATH:+$CLASSPATH:}$f"
done

export CLASSPATH

exec jruby "$NS4U" "$@"
data/bin/nosqoop4u.rb ADDED
@@ -0,0 +1,220 @@
1
+ #!/usr/bin/env jruby
2
+
3
+ require 'java'
4
+ require 'rubygems'
5
+ require 'getoptlong'
6
+
7
# Print the command-line help text, one option per line, to standard output.
def usage
  help_lines = [
    'usage: nosqoop4u options',
    '-o, --output # output file (hdfs://, file://, - for stdout)',
    '-c, --connect url # jdbc connection url (env NS4U_URL)',
    '-u, --user # db username (env NS4U_USER)',
    '-p, --pass # db password (env NS4U_PASS)',
    '-e, --query # sql query to run',
    '-F, --delim # delimiter (default: ^A)',
    '-h, --help'
  ]
  # puts with an array writes every element followed by a newline
  puts help_lines
end
19
+
20
# Runs a SQL query over JDBC (or, optionally, through the psql command line
# client) and streams each result row, as one delimited line, to an output
# object that responds to +puts+.
#
# Connection settings are taken from the configuration hash handed to
# +new+, falling back to the NS4U_USER, NS4U_PASS, NS4U_URL, NS4U_HOST and
# NS4U_DB environment variables.
class NoSqoop
  # Number of records written between two progress reports.
  PROGRESS_INTERVAL = 100_000

  # Field delimiter used when the caller does not supply one (^A).
  DEFAULT_DELIM = "\001"

  # cfg - Hash with optional :db_user, :db_pass, :db_url, :db_host and
  #       :db_name entries.
  #
  # Loads the JDBC driver that matches the URL, opens the connection and
  # prepares the statement used by later queries.
  # Raises RuntimeError when the URL is not a known database type.
  def initialize(cfg)
    @db_user = cfg[:db_user] || ENV['NS4U_USER']
    @db_pass = cfg[:db_pass] || ENV['NS4U_PASS']
    @db_url  = cfg[:db_url]  || ENV['NS4U_URL']
    @db_host = cfg[:db_host] || ENV['NS4U_HOST']
    @db_name = cfg[:db_name] || ENV['NS4U_DB']

    load_driver
    connect
    hack_jdbc
  end

  # Create @stmt with driver-specific settings so that large result sets
  # are streamed rather than buffered.
  def hack_jdbc
    case @db_url
    when /jdbc:mysql:/
      # by default, the mysql jdbc driver will read the entire table
      # into memory ... this will change to only one row at a time
      @stmt = @conn.create_statement java.sql.ResultSet.TYPE_FORWARD_ONLY,
                                     java.sql.ResultSet.CONCUR_READ_ONLY
      @stmt.fetch_size = java.lang.Integer.const_get 'MIN_VALUE'
    when /jdbc:postgresql:/
      @conn.set_auto_commit false
      @stmt = @conn.create_statement
      @stmt.fetch_size = 50
    else
      @stmt = @conn.create_statement
    end
  end

  # Reference the JDBC driver class that matches @db_url so that it gets
  # loaded. Raises RuntimeError for an unrecognised URL.
  def load_driver
    case @db_url
    when /jdbc:mysql:/
      Java::com.mysql.jdbc.Driver
      # handle 0000-00-00 timestamps without an exception, lulz
      # ('round' is the alternative to 'convertToNull')
      @db_url = with_zero_date_behavior(@db_url)
    when /jdbc:oracle:/
      Java::oracle.jdbc.OracleDriver
    when /jdbc:postgresql:/
      Java::org.postgresql.Driver
    else
      raise "error: unknown database type"
    end
  end

  # Open the JDBC connection and keep it in @conn.
  def connect
    @conn = java.sql.DriverManager.get_connection(@db_url, @db_user, @db_pass)
  end

  # Return +url+ with zeroDateTimeBehavior=convertToNull added unless the
  # URL already sets that property. A new string is built instead of
  # appending in place, so neither the caller's string nor a value read
  # from ENV is modified, and '&' is used when the URL already has a query
  # string.
  def with_zero_date_behavior(url)
    return url if url =~ /zeroDateTimeBehavior/

    separator = url.include?('?') ? '&' : '?'
    "#{url}#{separator}zeroDateTimeBehavior=convertToNull"
  end
  private :with_zero_date_behavior

  # Return {:cols => column count, :colnames => [names]} for result set +r+.
  def table_info(r)
    meta = r.meta_data
    cols = meta.column_count
    colnames = (1..cols).map { |i| meta.column_name(i) }

    { :cols => cols, :colnames => colnames }
  end
  private :table_info

  # Execute +sql+ through JDBC and yield every row as a single string whose
  # column values are joined by opts[:delim] (default ^A). NULL columns
  # become empty fields. The result set is closed when iteration ends,
  # including when the block raises.
  def query_jdbc(sql, opts = {})
    delim = opts[:delim] || DEFAULT_DELIM

    res = @stmt.execute_query sql
    begin
      cols = table_info(res)[:cols]
      while res.next
        # join puts the delimiter between columns only; JDBC columns are
        # numbered from 1
        row = (1..cols).map { |i| res.get_string(i).to_s }.join(delim)
        yield row
      end
    ensure
      res.close
    end
  end

  # Execute +sql+ through the psql command line client and yield every
  # output line (trailing newline included).
  #
  # psql is started without a shell: the password travels in the child's
  # environment and the query is passed as a single argument, so neither
  # needs quoting and the password is never echoed.
  def query_cmd(sql, opts = {})
    delim = opts[:delim] || DEFAULT_DELIM

    argv = ['psql', '-t', '-A', '-F', delim, '-c', sql]
    argv += ['-h', @db_host] if @db_host
    argv += ['-U', @db_user] if @db_user
    argv << @db_name if @db_name

    STDOUT.sync = true
    # block form closes the pipe and reaps the child when done
    IO.popen([{ 'PGPASSWORD' => @db_pass.to_s }, *argv]) do |io|
      io.each_line { |line| yield line }
    end
  end

  # Run +sql+ and write every record to opts[:output] (default STDOUT).
  #
  # opts[:query_type] - 'cmd' selects query_cmd, anything else query_jdbc.
  # opts[:delim]      - field delimiter, passed on to the query method.
  #
  # Progress (every PROGRESS_INTERVAL records) and the final summary go to
  # standard error so that they never end up in the middle of the records
  # when the records themselves are written to standard output.
  def query(sql, opts = {})
    output = opts[:output] || STDOUT
    runner = method(opts[:query_type].to_s == 'cmd' ? :query_cmd : :query_jdbc)
    recs = 0
    bytes = 0
    begin_ts = Time.now

    runner.call(sql, opts) do |s|
      output.puts s
      bytes += s.bytesize
      recs += 1
      next unless recs % PROGRESS_INTERVAL == 0

      stats = throughput(recs, bytes, begin_ts)
      $stderr.puts format('%d records (%.02f recs/s), %.02fMB (%.02f MB/s)',
                          recs, stats[:recs_rate], stats[:mb], stats[:mb_rate])
    end

    stats = throughput(recs, bytes, begin_ts)
    $stderr.puts
    $stderr.puts "= total time: #{stats[:elapsed]} seconds"
    $stderr.puts format('= records: %d records %.02f recs/s',
                        recs, stats[:recs_rate])
    $stderr.puts format('= data size: %.02fMB (%.02f MB/s)',
                        stats[:mb], stats[:mb_rate])
  end

  # Compute elapsed seconds (never less than 1, to keep the rates sane for
  # very short runs), megabytes written and the two rates. Megabytes are
  # kept as a float so that MB/s is not derived from a truncated value.
  def throughput(recs, bytes, begin_ts)
    elapsed = Time.now - begin_ts
    elapsed = 1 if elapsed < 1
    mb = bytes / 1024.0 / 1024.0

    { :elapsed => elapsed, :mb => mb,
      :recs_rate => recs / elapsed.to_f, :mb_rate => mb / elapsed.to_f }
  end
  private :throughput
end
149
+
150
# Create +filename+ (a URI such as hdfs://namenode/path) through the Hadoop
# FileSystem API and return the resulting output stream, extended with a
# +puts+ method so that it can be used like an IO.
#
# With a block, the stream is yielded and is closed when the block
# finishes, even when the block raises; the block's value is returned.
# Without a block the open stream is returned and the caller has to close
# it.
def hdfs_open_write(filename)
  conf = org.apache.hadoop.conf.Configuration.new
  uri  = java.net.URI.create filename
  path = org.apache.hadoop.fs.Path.new uri
  fs   = org.apache.hadoop.fs.FileSystem.get uri, conf

  out = fs.create path

  # Write +s+ followed by a newline unless it already ends with one.
  def out.puts(s)
    line = s.to_s
    # end_with? is also safe for the empty string
    line = "#{line}\n" unless line.end_with?("\n")
    # write the string's raw bytes: DataOutputStream#writeBytes keeps only
    # the low byte of every character and would mangle non-ASCII text
    write line.to_java_bytes
  end

  return out unless block_given?

  begin
    yield out
  ensure
    out.close
  end
end
168
+
169
# main
#
# Parse the command line, connect to the database and stream the result of
# the query to STDOUT, an HDFS file or a local file.

opts = {}
output = '-'
sql = nil

gopts = GetoptLong.new(
  ['--output',  '-o', GetoptLong::REQUIRED_ARGUMENT],
  ['--connect', '-c', GetoptLong::REQUIRED_ARGUMENT],
  ['--user',    '-u', GetoptLong::REQUIRED_ARGUMENT],
  ['--pass',    '-p', GetoptLong::REQUIRED_ARGUMENT],
  ['--query',   '-e', GetoptLong::REQUIRED_ARGUMENT],
  ['--delim',   '-F', GetoptLong::REQUIRED_ARGUMENT],
  ['--help',    '-h', GetoptLong::NO_ARGUMENT]
)

gopts.each do |opt, arg|
  case opt
  when '--output'
    output = arg
  when '--connect'
    opts[:db_url] = arg
  when '--user'
    opts[:db_user] = arg
  when '--pass'
    opts[:db_pass] = arg
  when '--delim'
    opts[:delim] = arg
  when '--query'
    sql = arg
  when '--help'
    usage
    exit
  end
end

# User, password and url may each come from the command line or from the
# environment variable named in the usage text; only give up when a value
# is available from neither source.
have_user = opts[:db_user] || ENV['NS4U_USER']
have_pass = opts[:db_pass] || ENV['NS4U_PASS']
have_url  = opts[:db_url]  || ENV['NS4U_URL']

unless have_user && have_pass && have_url && sql
  usage
  exit 1
end

ns = NoSqoop.new opts

case output
when '-' # STDOUT
  ns.query sql, opts
when /^hdfs:/
  hdfs_open_write(output) { |f| ns.query sql, opts.merge(:output => f) }
else # unix file path with or without leading file://
  path = output.sub(%r{^file://}, '')
  File.open(path, 'w') { |f| ns.query sql, opts.merge(:output => f) }
end
metadata ADDED
@@ -0,0 +1,78 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: nosqoop4u
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 1
9
+ version: 0.0.1
10
+ platform: java
11
+ authors:
12
+ - Frank Fejes
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2011-07-02 00:00:00 -05:00
18
+ default_executable:
19
+ dependencies: []
20
+
21
+ description: |
22
+ A sqoop-like jruby/jdbc query application that does not run via map/reduce.
23
+ It supports direct output to HDFS and unix filesystems as well as STDOUT.
24
+ Requires jruby 1.6+.
25
+
26
+ email: frank@fejes.net
27
+ executables:
28
+ - nosqoop4u
29
+ - nosqoop4u.rb
30
+ extensions: []
31
+
32
+ extra_rdoc_files: []
33
+
34
+ files:
35
+ - README
36
+ - CHANGELOG
37
+ - bin/nosqoop4u
38
+ - bin/nosqoop4u.rb
39
+ has_rdoc: true
40
+ homepage: https://github.com/fsfiii/nosqoop4u
41
+ licenses: []
42
+
43
+ post_install_message: |
44
+ ===
45
+ Please be sure to install with:
46
+
47
+ jgem install --no-wrapper nosqoop4u
48
+ ===
49
+
50
+ rdoc_options: []
51
+
52
+ require_paths:
53
+ - lib
54
+ required_ruby_version: !ruby/object:Gem::Requirement
55
+ none: false
56
+ requirements:
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ segments:
60
+ - 0
61
+ version: "0"
62
+ required_rubygems_version: !ruby/object:Gem::Requirement
63
+ none: false
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ segments:
68
+ - 0
69
+ version: "0"
70
+ requirements: []
71
+
72
+ rubyforge_project: nosqoop4u
73
+ rubygems_version: 1.3.7
74
+ signing_key:
75
+ specification_version: 3
76
+ summary: A sqoop-like jruby/jdbc query app that does not run via map/reduce.
77
+ test_files: []
78
+