nosqoop4u 0.0.1-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (5) hide show
  1. data/CHANGELOG +2 -0
  2. data/README +40 -0
  3. data/bin/nosqoop4u +33 -0
  4. data/bin/nosqoop4u.rb +220 -0
  5. metadata +78 -0
data/CHANGELOG ADDED
@@ -0,0 +1,2 @@
1
+ * 2011-07-02 - fsf
2
+ - initial import
data/README ADDED
@@ -0,0 +1,40 @@
1
+ nosqoop4u
2
+
3
+ A sqoop-like jruby/jdbc query application that does not run via map/reduce.
4
+ It supports direct output to HDFS and unix filesystems as well as STDOUT.
5
+
6
+ Requires jruby 1.6+.
7
+
8
+ - Why write this? What's wrong with sqoop?
9
+
10
+ Nothing is wrong with sqoop. It just doesn't do quite what I need and
11
+ it's not so straightforward to debug when something goes wrong. In
12
+ addition, my Hadoop cluster does not route beyond an access layer which
13
+ means I need to create individual db-specific firewall rules in order to
14
+ use sqoop.
15
+
16
+ Also, I wanted a nice opportunity to play with jruby's java interop
17
+ and here I get to use both the JDBC and Hadoop HDFS APIs. On a side
18
+ note, I'm blown away by jruby 1.6.2...stellar performance and an
19
+ elegant seamless integration with java.
20
+
21
+ # installation
22
+
23
+ jgem install --no-wrapper nosqoop4u
24
+
25
+ Or simply copy nosqoop4u.rb and nosqoop4u into your path.
26
+
27
+ # usage
28
+
29
+ usage: nosqoop4u options
30
+ -o, --output # output file (hdfs://, file://, - for stdout)
31
+ -c, --connect url # jdbc connection url (env NS4U_URL)
32
+ -u, --user # db username (env NS4U_USER)
33
+ -p, --pass # db password (env NS4U_PASS)
34
+ -e, --query # sql query to run
35
+ -F, --delim # delimiter (default: ^A)
36
+ -h, --help
37
+
38
+ Please let me know if you find this software useful!
39
+
40
+ --frank
data/bin/nosqoop4u ADDED
@@ -0,0 +1,33 @@
1
#!/bin/bash
#
# Launcher for nosqoop4u.rb: checks that HADOOP_HOME and jruby are available,
# puts the Hadoop core and lib jars on the CLASSPATH, then hands every command
# line argument to the jruby script that lives next to this file.

if [ -z "$HADOOP_HOME" ]; then
  echo "error: HADOOP_HOME is not defined" >&2
  exit 1
fi

export PATH=/usr/local/jruby/bin:$PATH
if ! type -p jruby >/dev/null 2>&1; then
  echo "error: cannot find jruby...please set your PATH" >&2
  exit 1
fi
export JRUBY_OPTS="--1.9 --fast --server"

# dirname also handles an invocation without any slash in $0 (e.g. `bash nosqoop4u`)
NS4U="$(dirname "$0")/nosqoop4u.rb"
if [ ! -f "$NS4U" ]; then
  echo "error: cannot find nosqoop4u.rb...please install it alongside:" >&2
  echo "  $0" >&2
  exit 2
fi

# bring in the hadoop/jdbc jars
for f in "$HADOOP_HOME"/hadoop-core-*.jar "$HADOOP_HOME"/lib/*.jar; do
  # an unmatched glob stays literal; do not put it on the classpath
  [ -e "$f" ] || continue
  # avoid a leading ':' (an empty entry means the current directory)
  CLASSPATH="${CLASSPATH:+$CLASSPATH:}$f"
done

export CLASSPATH

exec jruby "$NS4U" "$@"
data/bin/nosqoop4u.rb ADDED
@@ -0,0 +1,220 @@
1
+ #!/usr/bin/env jruby
2
+
3
+ require 'java'
4
+ require 'rubygems'
5
+ require 'getoptlong'
6
+
7
# Print the command line help text.
#
# io - object responding to #puts that receives the text (default: $stdout),
#      so callers reporting an error can direct the help to $stderr instead.
def usage io = $stdout
  io.puts <<-EOF
usage: nosqoop4u options
  -o, --output      # output file (hdfs://, file://, - for stdout)
  -c, --connect url # jdbc connection url (env NS4U_URL)
  -u, --user        # db username (env NS4U_USER)
  -p, --pass        # db password (env NS4U_PASS)
  -e, --query       # sql query to run
  -F, --delim       # delimiter (default: ^A)
  -h, --help
  EOF
end
19
+
20
# Runs a SQL query against a database and streams every row, as one
# delimiter-separated line, to an output object. Rows are normally fetched
# over JDBC; a psql command line path exists for :query_type => 'cmd'.
class NoSqoop
  # cfg - Hash with optional keys :db_user, :db_pass, :db_url, :db_host and
  #       :db_name. A missing key falls back to the NS4U_USER, NS4U_PASS,
  #       NS4U_URL, NS4U_HOST or NS4U_DB environment variable respectively.
  #
  # Loads the JDBC driver, opens the connection and prepares the statement.
  # Raises RuntimeError when the URL is not a known database type.
  def initialize cfg
    @db_user = cfg[:db_user] || ENV['NS4U_USER']
    @db_pass = cfg[:db_pass] || ENV['NS4U_PASS']
    @db_url  = cfg[:db_url]  || ENV['NS4U_URL']
    @db_host = cfg[:db_host] || ENV['NS4U_HOST']
    @db_name = cfg[:db_name] || ENV['NS4U_DB']

    load_driver
    connect
    hack_jdbc
  end

  # Create @stmt with the driver specific settings needed to stream a large
  # result set instead of buffering it.
  def hack_jdbc
    case @db_url
    when /jdbc:mysql:/
      # by default, the mysql jdbc driver will read the entire table
      # into memory ... this will change to only one row at a time
      @stmt = @conn.create_statement(java.sql.ResultSet.TYPE_FORWARD_ONLY,
                                     java.sql.ResultSet.CONCUR_READ_ONLY)
      @stmt.fetch_size = java.lang.Integer.const_get 'MIN_VALUE'
    when /jdbc:postgresql:/
      @conn.set_auto_commit false
      @stmt = @conn.create_statement
      @stmt.fetch_size = 50
    else
      @stmt = @conn.create_statement
    end
  end

  # Reference the JDBC driver class that matches @db_url so it gets loaded.
  # For mysql the zeroDateTimeBehavior=convertToNull parameter is added to
  # the URL unless the URL already sets zeroDateTimeBehavior.
  #
  # Raises RuntimeError for any other (or missing) URL.
  def load_driver
    case @db_url
    when /jdbc:mysql:/
      Java::com.mysql.jdbc.Driver
      # handle 0000-00-00 timestamps without an exception
      unless @db_url =~ /zeroDateTimeBehavior/
        # a URL that already carries parameters needs '&', not a second '?'
        sep = @db_url.include?('?') ? '&' : '?'
        # build a new string: the caller's (possibly frozen) value stays untouched
        @db_url = "#{@db_url}#{sep}zeroDateTimeBehavior=convertToNull"
      end
    when /jdbc:oracle:/
      Java::oracle.jdbc.OracleDriver
    when /jdbc:postgresql:/
      Java::org.postgresql.Driver
    else
      raise "error: unknown database type"
    end
  end

  # Open the JDBC connection and keep it in @conn.
  def connect
    @conn = java.sql.DriverManager.get_connection(@db_url, @db_user, @db_pass)
  end

  # r - a result set.
  #
  # Returns a Hash with :cols (column count) and :colnames (Array of names).
  def table_info r
    meta = r.meta_data
    cols = meta.column_count
    colnames = (1..cols).map { |i| meta.column_name(i) }

    {:cols => cols, :colnames => colnames}
  end
  private :table_info

  # Execute sql through @stmt and yield one String per row.
  #
  # sql  - the query text.
  # opts - Hash; :delim is the column separator (default "\001").
  #
  # Columns are joined with the delimiter (none before the first column) and
  # a NULL column becomes an empty field. The result set is closed when the
  # iteration ends, including when the block raises.
  def query_jdbc sql, opts = {}
    delim = opts[:delim] || "\001"

    res = @stmt.execute_query sql
    begin
      ncols = table_info(res)[:cols]
      while res.next
        fields = (1..ncols).map { |i| res.get_string(i).to_s }
        yield fields.join(delim)
      end
    ensure
      res.close
    end
  end

  # Run sql through the psql command line client and yield each output line.
  #
  # sql  - the query text.
  # opts - Hash; :delim is the field separator given to psql -F
  #        (default "\001").
  #
  # psql is started without a shell (argument array), so the query needs no
  # quoting, and the password travels in the child's PGPASSWORD environment
  # variable rather than on a command line.
  def query_cmd sql, opts = {}
    delim = opts[:delim] || "\001"

    argv = ['psql', '-t', '-A', '-F', delim, '-c', sql]
    argv.push '-h', @db_host if @db_host
    argv.push '-U', @db_user if @db_user
    argv.push @db_name if @db_name
    env = @db_pass ? {'PGPASSWORD' => @db_pass} : {}

    # diagnostic only: goes to stderr and never includes the password
    $stderr.puts argv.inspect
    STDOUT.sync = true
    IO.popen([env, *argv]) do |io|
      io.each_line { |line| yield line }
    end
  end

  # Run sql and write every row to the output, reporting throughput every
  # 100000 records and once more at the end.
  #
  # sql  - the query text.
  # opts - Hash; :output is an object responding to #puts (default STDOUT),
  #        :query_type 'cmd' selects query_cmd instead of query_jdbc, and the
  #        Hash is passed on to that method (e.g. :delim).
  #
  # Statistics are printed to $stdout, except when the rows themselves go to
  # STDOUT: then they are printed to $stderr so they cannot end up in the data.
  def query sql, opts = {}
    output = opts[:output] || STDOUT
    log = output.equal?(STDOUT) ? $stderr : $stdout
    runner = opts[:query_type].to_s == 'cmd' ? method(:query_cmd) : method(:query_jdbc)

    recs = 0
    bytes = 0
    begin_ts = Time.now

    runner.call(sql, opts) do |s|
      output.puts s
      bytes += s.bytesize
      recs += 1
      next unless recs % 100000 == 0

      _elapsed, mb_out, rate, rate_r = transfer_stats recs, bytes, begin_ts
      log.puts "#{recs} records (%.02f recs/s), #{mb_out}MB (%.02f MB/s)" %
               [rate_r, rate]
    end

    elapsed, mb_out, rate, rate_r = transfer_stats recs, bytes, begin_ts
    log.puts
    log.puts "= total time: #{elapsed} seconds"
    log.puts "= records: #{recs} records %.02f recs/s" % rate_r
    log.puts "= data size: #{mb_out}MB (%.02f MB/s)" % rate
  end

  # Returns [elapsed seconds, whole MB written, MB per second, records per
  # second] for the transfer that started at begin_ts. The elapsed time is
  # clamped to at least one second so the rates never divide by (nearly) zero.
  def transfer_stats recs, bytes, begin_ts
    elapsed = Time.now - begin_ts
    elapsed = 1 if elapsed < 1
    mb = bytes / 1024.0 / 1024.0
    [elapsed, mb.to_i, mb / elapsed, recs / elapsed]
  end
  private :transfer_stats
end
149
+
150
# Create (via FileSystem#create) the file named by an hdfs:// style URI and
# return its output stream, extended with a #puts method so it can be used
# like an IO by the query code.
#
# filename - String URI of the file to create.
#
# Without a block the open stream is returned and the caller must close it.
# With a block the stream is yielded and closed afterwards, even when the
# block raises.
def hdfs_open_write filename
  conf = org.apache.hadoop.conf.Configuration.new
  uri  = java.net.URI.create filename
  path = org.apache.hadoop.fs.Path.new uri
  fs   = org.apache.hadoop.fs.FileSystem.get uri, conf

  out = fs.create path

  # Write s followed by a newline unless it already ends with one.
  def out.puts s
    line = s.to_s
    # end_with? is also safe for an empty string
    line = "#{line}\n" unless line.end_with?("\n")
    # write the raw bytes: DataOutputStream#writeBytes keeps only the low
    # eight bits of every character and would mangle non-ASCII data
    self.write line.to_java_bytes
  end

  return out unless block_given?

  begin
    yield out
  ensure
    out.close
  end
end
168
+
169
# main
#
# Parse the command line, validate the settings, then run the query and send
# the rows to STDOUT, an HDFS file or a local file depending on --output.

opts = {}
output = '-'
sql = nil

gopts = GetoptLong.new(
  [ '--output',  '-o', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--connect', '-c', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--user',    '-u', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--pass',    '-p', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--query',   '-e', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--delim',   '-F', GetoptLong::REQUIRED_ARGUMENT ],
  [ '--help',    '-h', GetoptLong::NO_ARGUMENT ]
)

gopts.each do |opt, arg|
  case opt
  when '--output'
    output = arg
  when '--connect'
    opts[:db_url] = arg
  when '--user'
    opts[:db_user] = arg
  when '--pass'
    opts[:db_pass] = arg
  when '--delim'
    opts[:delim] = arg
  when '--query'
    sql = arg
  when '--help'
    usage
    exit
  end
end

# The connect url, user and password may each come from the command line or
# from the documented NS4U_* environment variable; the query has no fallback.
env_fallback = {:db_url => 'NS4U_URL', :db_user => 'NS4U_USER', :db_pass => 'NS4U_PASS'}
unconfigured = env_fallback.any? { |key, name| opts[key].nil? && ENV[name].nil? }

if sql.nil? || unconfigured
  usage
  exit 1
end

ns = NoSqoop.new opts

case output
when '-' # STDOUT
  ns.query sql, opts
when /^hdfs:/
  hdfs_open_write(output) { |f| ns.query sql, opts.merge(:output => f) }
else # unix file path with or without leading file://
  path = output.sub(%r|^file://|, '')
  File.open(path, 'w') { |f| ns.query sql, opts.merge(:output => f) }
end
metadata ADDED
@@ -0,0 +1,78 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: nosqoop4u
3
+ version: !ruby/object:Gem::Version
4
+ prerelease: false
5
+ segments:
6
+ - 0
7
+ - 0
8
+ - 1
9
+ version: 0.0.1
10
+ platform: java
11
+ authors:
12
+ - Frank Fejes
13
+ autorequire:
14
+ bindir: bin
15
+ cert_chain: []
16
+
17
+ date: 2011-07-02 00:00:00 -05:00
18
+ default_executable:
19
+ dependencies: []
20
+
21
+ description: |
22
+ A sqoop-like jruby/jdbc query application that does not run via map/reduce.
23
+ It supports direct output to HDFS and unix filesystems as well as STDOUT.
24
+ Requires jruby 1.6+.
25
+
26
+ email: frank@fejes.net
27
+ executables:
28
+ - nosqoop4u
29
+ - nosqoop4u.rb
30
+ extensions: []
31
+
32
+ extra_rdoc_files: []
33
+
34
+ files:
35
+ - README
36
+ - CHANGELOG
37
+ - bin/nosqoop4u
38
+ - bin/nosqoop4u.rb
39
+ has_rdoc: true
40
+ homepage: https://github.com/fsfiii/nosqoop4u
41
+ licenses: []
42
+
43
+ post_install_message: |
44
+ ===
45
+ Please be sure to install with:
46
+
47
+ jgem install --no-wrapper nosqoop4u
48
+ ===
49
+
50
+ rdoc_options: []
51
+
52
+ require_paths:
53
+ - lib
54
+ required_ruby_version: !ruby/object:Gem::Requirement
55
+ none: false
56
+ requirements:
57
+ - - ">="
58
+ - !ruby/object:Gem::Version
59
+ segments:
60
+ - 0
61
+ version: "0"
62
+ required_rubygems_version: !ruby/object:Gem::Requirement
63
+ none: false
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ segments:
68
+ - 0
69
+ version: "0"
70
+ requirements: []
71
+
72
+ rubyforge_project: nosqoop4u
73
+ rubygems_version: 1.3.7
74
+ signing_key:
75
+ specification_version: 3
76
+ summary: A sqoop-like jruby/jdbc query app that does not run via map/reduce.
77
+ test_files: []
78
+