sql2avro 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/Makefile ADDED
@@ -0,0 +1,6 @@
1
# Download the Avro tools jar that the gem shells out to.
# `$@` already expands to the full target path (vendor/avro-tools-1.7.4.jar),
# so the download is written to `$@` directly; `$(@D)` is the target's
# directory and is created first so the redirect cannot fail on a fresh tree.
vendor/avro-tools-1.7.4.jar:
	mkdir -p $(@D)
	curl http://www.us.apache.org/dist/avro/avro-1.7.4/java/avro-tools-1.7.4.jar > $@

# Build the gem from its gemspec (`$<` is the first prerequisite).
sql2avro-0.1.0.gem: sql2avro.gemspec
	bundle exec gem build $<
6
+
@@ -0,0 +1,10 @@
1
# Abstract base class for database adapters.
#
# Both methods here only raise; an adapter subclass is expected to override
# them with real implementations.
class DbInterface
  # Placeholder for "return the Avro JSON schema for +table+".
  #
  # @param table [String] name of the table to describe
  # @raise [RuntimeError] always, in this base implementation
  def schema(table)
    raise "Return Avro JSON schema for #{table}"
  end

  # Placeholder for "return the Avro JSON data for +table+".
  #
  # The id bounds are optional so that one-argument callers keep working while
  # the signature also accepts the (table, min_id, max_id) form used by the
  # MySql adapter.
  #
  # @param table [String] name of the table to read
  # @param _min_id [Integer, nil] lower id bound (unused here)
  # @param _max_id [Integer, nil] upper id bound (unused here)
  # @raise [RuntimeError] always, in this base implementation
  def data(table, _min_id = nil, _max_id = nil)
    raise "Return Avro JSON data for #{table}"
  end
end
10
+
@@ -0,0 +1,203 @@
1
+ require_relative './interface'
2
+ require 'open3'
3
+
4
# MySQL adapter. It shells out to the `mysql` command-line client in batch
# mode and turns the tab-separated output into Avro-style schemas and rows.
class MySql < DbInterface
  # Column separator used when splitting each line of `mysql --batch` output.
  MYSQL_BATCH_SEP = "\t"

  # config is a hash with this form (like ActiveRecord's); note that the keys
  # are looked up as strings:
  #   {
  #     'host'     => "localhost",
  #     'username' => "myuser",
  #     'password' => "mypass",
  #     'database' => "somedatabase"
  #   }
  #
  def initialize(config)
    @db_host  = config['host']
    @db_name  = config['database']
    @username = config['username']
    @password = config['password']
  end

  # Builds the Avro record schema for +table+.
  #
  # Every column becomes a field whose type is the union ['null', <avro type>].
  #
  # Returns a hash with :type, :name and :fields keys.
  def schema(table)
    fields = avro_types(table).map do |column, avro_type|
      { name: column, type: ['null', avro_type] }
    end

    { type: "record", name: table, fields: fields }
  end

  # Returns MAX(id) of +table+ as an Integer (a "NULL" result converts to 0
  # via to_i), or nil when the client produced no data row at all.
  def max_id(table)
    header_seen = false
    query("SELECT MAX(id) FROM #{table}") do |line|
      # The first line of batch output is the column header; skip it.
      unless header_seen
        header_seen = true
        next
      end

      return line.first.to_i
    end
  end

  # Reads the rows of +table+ whose id lies in [min_id, max_id].
  #
  # Returns an array of hashes mapping column name to either nil or a
  # one-entry hash { avro_type => value }.
  #
  # Raises RuntimeError ("Unsupported type: ...") when a non-null value
  # belongs to a column whose Avro type is not handled.
  def data(table, min_id, max_id)
    types = avro_types(table)
    columns = nil
    rows = []

    sql = "SELECT * FROM #{table} WHERE id >= #{min_id} AND id <= #{max_id}"
    query(sql) do |line|
      # Get header.
      if columns.nil?
        columns = line
        next
      end

      # Construct row mapping column names to values of appropriate type.
      row = {}
      columns.each_with_index do |colname, i|
        row[colname] = avro_value(types[colname], line[i])
      end

      rows << row
    end

    # TODO: stream this data out rather than return all in one batch.
    rows
  end

  # Runs DESCRIBE on +table+ and returns a hash of column name => MySQL type
  # string (the first two fields of each output line).
  def sql_schema(table)
    header_seen = false
    columns = {}

    query("DESCRIBE #{table}") do |line|
      unless header_seen
        header_seen = true
        next
      end

      name, type = line[0], line[1]
      columns[name] = type
    end

    columns
  end

  # Returns a hash of column name => Avro type name for +table+.
  def avro_types(table)
    sql_schema(table).each_with_object({}) do |(column, mysql_type), types|
      types[column] = MySql.avro_type(mysql_type)
    end
  end

  # Runs +sql+ with this instance's connection settings; see MySql.query.
  def query(sql, &block)
    MySql.query(sql, @db_host, @db_name, @username, @password, &block)
  end

  # Runs +sql+ through the `mysql` client and calls +block+ once per output
  # line with the line split on MYSQL_BATCH_SEP (the header line included).
  #
  # The client is started from an argument list rather than a shell string, so
  # characters in the SQL or credentials are never interpreted by a shell.
  #
  # Raises RuntimeError when the client exits with a non-zero status.
  def self.query(sql, db_host, db_name, username, password, &block)
    argv = ['mysql', '--batch', "--execute=#{sql}"]
    argv.push('--host', db_host.to_s) unless db_host.to_s.empty?
    argv.push('--user', username.to_s) unless username.to_s.empty?
    argv.push("--password=#{password}", '--quick')
    argv << db_name.to_s unless db_name.to_s.empty?

    Open3.popen3(*argv) do |stdin, stdout, stderr, wait_thr|
      # Nothing is sent to the client on stdin.
      stdin.close

      while (line = stdout.gets)
        # chomp only strips a line terminator; the -1 limit keeps trailing
        # empty columns so fields stay aligned with the header.
        block.call(line.chomp.split(MYSQL_BATCH_SEP, -1))
      end

      # stderr is only read once stdout is exhausted.
      errors = stderr.read
      status = wait_thr.value
      unless status.success?
        raise "mysql exited with status #{status.exitstatus}: #{errors.strip}"
      end
    end
  end

  # Maps a MySQL column type string to an Avro primitive type name.
  #
  # The patterns are unanchored and tried in order, so the order matters.
  #
  # Raises RuntimeError for types that match none of the patterns.
  def self.avro_type(mysql_type)
    # Refer to https://github.com/apache/sqoop/blob/trunk/src/java/org/apache/sqoop/manager/ConnManager.java#L172.

    case mysql_type

    # See https://dev.mysql.com/doc/refman/5.0/en/numeric-type-overview.html
    when /tinyint\(1\)/, /bool/, /boolean/
      'boolean'
    # bigint has to be tested before the generic /int/ pattern below, which
    # would otherwise match it first and yield 'int'.
    when /bigint/, /serial/
      'long'
    # NOTE(review): unsigned int columns can exceed a 32-bit signed value;
    # consider whether they should map to 'long' as well.
    when /tinyint/, /smallint/, /mediumint/, /integer/, /int/
      'int'
    when /decimal/, /dec/
      'string'
    when /float/
      'float'
    when /double/
      'double'
    when /varchar\(\d+\)/
      'string'

    # See https://dev.mysql.com/doc/refman/5.0/en/date-and-time-type-overview.html.
    when /date/, /datetime/, /time/, /timestamp/
      'string'
    when /year/
      'int'

    # See https://dev.mysql.com/doc/refman/5.0/en/string-type-overview.html.
    when /char/, /varchar/
      'string'
    when /binary/, /varbinary/
      'bytes'
    when /tinytext/, /text/, /longtext/
      'string'
    when /tinyblob/, /blob/, /longblob/
      'bytes'
    else
      raise "Unsupported MySQL data type: #{mysql_type}"
    end
  end

  private

  # Converts one raw field of client output into its Avro JSON form.
  #
  # NOTE: all non-null type values are wrapped in a mapping from type to value,
  # because that's what the Avro spec requires; see:
  #   - http://avro.apache.org/docs/current/spec.html#json_encoding
  #   - http://mail-archives.apache.org/mod_mbox/avro-user/201304.mbox/%3CCD86687D.E892E%25scott@richrelevance.com%3E
  #
  # NOTE(review): values are used exactly as the client printed them; confirm
  # whether batch-mode escape sequences need to be undone for strings/bytes.
  def avro_value(type, value)
    # Handle nulls: the text "NULL" is treated as a null value.
    return nil if value == "NULL"

    # Perform any necessary typecasts.
    case type
    when 'boolean'
      # Any non-zero integer is true.
      { type => !value.to_i.zero? }
    when 'int', 'long'
      { type => value.to_i }
    when 'float', 'double'
      { type => value.to_f }
    when 'bytes', 'string'
      { type => value }
    else
      raise "Unsupported type: #{type}"
    end
  end
end
203
+
data/lib/sql2avro.rb ADDED
@@ -0,0 +1,63 @@
1
+ require 'open3'
2
+ require 'yaml'
3
+ require 'yajl'
4
+ require_relative 'interface/mysql'
5
+
6
module Sql2Avro
  # Location of the avro-tools jar. The path is resolved relative to this
  # file: the first '..' steps from the file to its directory, the second to
  # that directory's parent, where the vendor/ directory is expected to sit.
  AVRO_TOOLS_PATH = File.expand_path('../../vendor/avro-tools-1.7.4.jar', __FILE__)

  # Pulls data from the given database table starting from the given id.
  #
  # This function creates an Avro file as a side effect, and returns {
  #   max_id: greatest ID that was pulled in,
  #   path: filepath of the resulting avroized file
  #   error: the exception raised while writing the file, if any; otherwise omitted
  # }
  #
  # database_config is a hash with this form (like ActiveRecord's); the keys
  # are looked up as strings:
  #   {
  #     'adapter'  => "mysql",
  #     'host'     => "localhost",
  #     'username' => "myuser",
  #     'password' => "mypass",
  #     'database' => "somedatabase"
  #   }
  #
  # table is the table to pull from.
  #
  # min_id specifies the value of the id column from which to start.
  #
  # Raises RuntimeError when the adapter is missing or is not 'mysql'.
  def Sql2Avro.avroize(database_config, table, min_id)
    raise "Database interface not specified." if !database_config.has_key? 'adapter'
    raise "Database interface not supported: #{database_config['adapter']}" if database_config['adapter'] != 'mysql'

    interface = MySql.new(database_config)

    schema = Yajl::Encoder.encode(interface.schema(table))
    max_id = interface.max_id(table)

    # Same "<date>T<time>Z" stamp as before, built directly from the UTC time.
    timestamp = Time.now.utc.strftime('%Y-%m-%dT%H:%M:%SZ')
    filename = "#{table}.#{timestamp}.#{min_id}.#{max_id}.avro"

    retval = {
      max_id: max_id,
      path: filename
    }

    begin
      # NOTE(review): schema and filename are interpolated into a shell command
      # unescaped; a table name containing quotes would break it.
      command = "java -jar #{AVRO_TOOLS_PATH} fromjson --codec snappy --schema '#{schema}' /dev/stdin > #{filename}"
      Open3.popen3(command) do |stdin, _stdout, stderr, wait_thr|
        # One JSON document per line is fed to the converter on stdin.
        interface.data(table, min_id, max_id).each do |datum|
          Yajl::Encoder.encode(datum, stdin)
          stdin.write "\n"
        end

        # Signal end of input, then wait for the converter to finish.
        stdin.close
        errors = stderr.read
        status = wait_thr.value
        unless status.success?
          raise "avro-tools exited with status #{status.exitstatus}: #{errors.strip}"
        end
      end
    rescue => e
      retval[:error] = e
    end

    retval
  end
end
63
+
Binary file
metadata ADDED
@@ -0,0 +1,71 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: sql2avro
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Mason Simon
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-06-20 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: yajl-ruby
16
+ requirement: &70282006753460 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *70282006753460
25
+ description: sql2avro extracts data from a specified SQL database table and transforms
26
+ it into an Avro file with a schema based on the database table's schema. The intended
27
+ use case is to incrementally load data out of an SQL database and into HDFS for
28
+ analysis via Hadoop.
29
+ email:
30
+ - mason@verbasoftware.com
31
+ executables: []
32
+ extensions: []
33
+ extra_rdoc_files: []
34
+ files:
35
+ - lib/interface/interface.rb
36
+ - lib/interface/mysql.rb
37
+ - lib/sql2avro.rb
38
+ - vendor/avro-tools-1.7.4.jar
39
+ - Makefile
40
+ homepage: https://github.com/Verba/sql2avro
41
+ licenses:
42
+ - Apache 2.0
43
+ post_install_message:
44
+ rdoc_options: []
45
+ require_paths:
46
+ - lib
47
+ required_ruby_version: !ruby/object:Gem::Requirement
48
+ none: false
49
+ requirements:
50
+ - - ! '>='
51
+ - !ruby/object:Gem::Version
52
+ version: '0'
53
+ segments:
54
+ - 0
55
+ hash: -1867060671403551677
56
+ required_rubygems_version: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ segments:
63
+ - 0
64
+ hash: -1867060671403551677
65
+ requirements: []
66
+ rubyforge_project:
67
+ rubygems_version: 1.8.10
68
+ signing_key:
69
+ specification_version: 3
70
+ summary: Tool for pulling data from SQL database tables into Avro files.
71
+ test_files: []