sql2avro 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Makefile +6 -0
- data/lib/interface/interface.rb +10 -0
- data/lib/interface/mysql.rb +203 -0
- data/lib/sql2avro.rb +63 -0
- data/vendor/avro-tools-1.7.4.jar +0 -0
- metadata +71 -0
data/lib/interface/mysql.rb
ADDED
@@ -0,0 +1,203 @@
|
|
1
|
+
require_relative './interface'
|
2
|
+
require 'open3'
|
3
|
+
|
4
|
+
class MySql < DbInterface
|
5
|
+
MYSQL_BATCH_SEP = "\t"
|
6
|
+
|
7
|
+
# Stores the connection settings used by every later query.
#
# config is a hash with string keys (shaped like ActiveRecord's database config):
#   {
#     'host'     => "localhost",
#     'username' => "myuser",
#     'password' => "mypass",
#     'database' => "somedatabase"
#   }
#
# Keys that are absent simply leave the matching setting nil.
def initialize(config)
  @db_host, @db_name, @username, @password =
    config.values_at('host', 'database', 'username', 'password')
end
|
21
|
+
|
22
|
+
# Builds the Avro record schema for +table+ as a plain Ruby hash.
#
# The record is named after the table and has one field per column reported
# by avro_types. Each field's type is the union ['null', <avro type>], so
# every column is nullable in the resulting schema.
def schema(table)
  fields = avro_types(table).map do |column, avro_type|
    { name: column, type: ['null', avro_type] }
  end

  { type: "record", name: table, fields: fields }
end
|
37
|
+
|
38
|
+
# Returns MAX(id) for +table+ as an Integer.
#
# The first line handed back by query is the column header and is skipped;
# the first field of the next line is converted with to_i (so a "NULL"
# result becomes 0). If no data line arrives, the return value of query is
# passed through unchanged.
def max_id(table)
  lines_seen = 0
  query("SELECT MAX(id) FROM #{table}") do |fields|
    lines_seen += 1
    return fields.first.to_i if lines_seen > 1
  end
end
|
49
|
+
|
50
|
+
# Fetches the rows of +table+ whose id lies in [min_id, max_id] and returns
# them as an array of hashes, one per row, keyed by column name.
#
# Values are shaped for Avro's JSON encoding of a ['null', T] union:
# * the text "NULL" (as handed back by query) becomes nil;
# * anything else becomes { avro_type => typed_value }.
#
# Raises RuntimeError ("Unsupported type: ...") when avro_types reports a
# type this method has no cast for.
#
# See:
# - http://avro.apache.org/docs/current/spec.html#json_encoding
# - http://mail-archives.apache.org/mod_mbox/avro-user/201304.mbox/%3CCD86687D.E892E%25scott@richrelevance.com%3E
def data(table, min_id, max_id)
  types = avro_types(table)
  columns = nil
  rows = []

  sql = "SELECT * FROM #{table} WHERE id >= #{min_id} AND id <= #{max_id}"

  query(sql) do |line|
    # The first line is the header naming the columns.
    if columns.nil?
      columns = line
      next
    end

    row = columns.each_with_index.each_with_object({}) do |(colname, i), h|
      value = line[i]

      # A string column holding the literal text "NULL" cannot be told apart
      # from a SQL NULL here.
      if value == "NULL"
        h[colname] = nil
        next
      end

      type = types[colname]
      h[colname] =
        case type
        when 'boolean'
          # Any non-zero integer is true; the previous `to_i.zero?` inverted this.
          { type => !value.to_i.zero? }
        when 'int', 'long'
          { type => value.to_i }
        when 'float', 'double'
          { type => value.to_f }
        when 'bytes', 'string'
          { type => value }
        else
          raise "Unsupported type: #{type}"
        end
    end

    rows << row
  end

  # TODO: stream this data out rather than return all in one batch.
  rows
end
|
109
|
+
|
110
|
+
# Runs DESCRIBE on +table+ and returns a hash mapping each column name
# (first field of a line) to its SQL type text (second field).
# The first line returned by query is the header and is ignored.
def sql_schema(table)
  column_types = {}
  awaiting_header = true

  query("DESCRIBE #{table}") do |fields|
    if awaiting_header
      awaiting_header = false
    else
      column_types[fields[0]] = fields[1]
    end
  end

  column_types
end
|
126
|
+
|
127
|
+
# Returns a hash mapping each column name of +table+ to its Avro type name,
# obtained by passing the SQL type from sql_schema through MySql.avro_type.
def avro_types(table)
  sql_schema(table).each_with_object({}) do |(column, mysql_type), avro|
    avro[column] = MySql.avro_type(mysql_type)
  end
end
|
137
|
+
|
138
|
+
# Runs +sql+ using the connection settings captured in initialize and hands
# each result line to the given block by delegating to MySql.query.
def query(sql, &block)
  connection = [@db_host, @db_name, @username, @password]
  MySql.query(sql, *connection, &block)
end
|
141
|
+
|
142
|
+
# Runs +sql+ through the `mysql` command-line client in --batch mode and
# calls the block once per output line (header line included) with that
# line's tab-separated fields as an array of strings.
#
# The client is started directly from an argument list rather than through a
# shell, so quotes, backticks and `$` inside the SQL or the credentials reach
# mysql untouched. The client's stderr is discarded (the previous version
# piped it but never read it, which could stall a chatty child).
#
# Returns nil once all output has been consumed.
def self.query(sql, db_host, db_name, username, password, &block)
  argv = ['mysql', '--batch', "--execute=#{sql}"]
  # Skip options whose value is nil instead of letting the next flag be
  # swallowed as the missing value.
  argv.push('--host', db_host.to_s) if db_host
  argv.push('--user', username.to_s) if username
  argv << "--password=#{password}"
  argv << '--quick'
  argv << db_name.to_s if db_name

  Open3.popen2(*argv, err: File::NULL) do |stdin, stdout, _wait_thr|
    stdin.close # nothing is ever written to the client
    stdout.each_line do |line|
      # chomp strips only a real line terminator; the -1 limit keeps trailing
      # empty columns so fields stay aligned with the header.
      block.call(line.chomp.split(MYSQL_BATCH_SEP, -1))
    end
    nil
  end
end
|
160
|
+
|
161
|
+
# Maps a MySQL column type string (for example "int(11)" or "varchar(255)")
# to the name of the Avro primitive type used to store it.
#
# Branch order matters because the patterns are unanchored substrings: a more
# specific name must be tested before any shorter name it contains
# (tinyint(1) before tinyint, bigint before int).
#
# Raises RuntimeError for a type no pattern matches.
#
# Refer to https://github.com/apache/sqoop/blob/trunk/src/java/org/apache/sqoop/manager/ConnManager.java#L172.
def self.avro_type(mysql_type)
  case mysql_type

  # See https://dev.mysql.com/doc/refman/5.0/en/numeric-type-overview.html
  when /tinyint\(1\)/, /bool/, /boolean/
    'boolean'
  when /bigint/, /serial/
    # Must precede the /int/ branch below, which would otherwise claim
    # "bigint(20)" and squeeze 64-bit values into a 32-bit Avro int.
    'long'
  when /tinyint/, /smallint/, /mediumint/, /integer/, /int/
    'int'
  when /decimal/, /dec/
    'string'
  when /float/
    'float'
  when /double/
    'double'
  when /varchar\(\d+\)/
    'string'

  # See https://dev.mysql.com/doc/refman/5.0/en/date-and-time-type-overview.html.
  when /date/, /datetime/, /time/, /timestamp/
    'string'
  when /year/
    'int'

  # See https://dev.mysql.com/doc/refman/5.0/en/string-type-overview.html.
  when /char/, /varchar/
    'string'
  when /binary/, /varbinary/
    'bytes'
  when /tinytext/, /text/, /longtext/
    'string'
  when /tinyblob/, /blob/, /longblob/
    'bytes'
  else
    raise "Unsupported MySQL data type: #{mysql_type}"
  end
end
|
201
|
+
|
202
|
+
end
|
203
|
+
|
data/lib/sql2avro.rb
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
require 'open3'
|
2
|
+
require 'yaml'
|
3
|
+
require 'yajl'
|
4
|
+
require_relative 'interface/mysql'
|
5
|
+
|
6
|
+
module Sql2Avro
  # The jar is shipped at <gem root>/vendor, one directory above lib/, so two
  # '..' steps are needed: the first only strips this file's own name.
  AVRO_TOOLS_PATH = File.expand_path('../../vendor/avro-tools-1.7.4.jar', __FILE__)

  # Pulls data from the given database table starting from the given id.
  #
  # This function creates an Avro file (in the current working directory) as
  # a side effect, and returns {
  #   max_id: greatest ID that was pulled in,
  #   path: filepath of the resulting avroized file
  #   error: the exception describing what went wrong, if anything; otherwise omitted
  # }
  #
  # database_config is a hash with string keys (like ActiveRecord's):
  #   {
  #     'adapter'  => "mysql",
  #     'host'     => "localhost",
  #     'username' => "myuser",
  #     'password' => "mypass",
  #     'database' => "somedatabase"
  #   }
  #
  # table is the table to pull from.
  #
  # min_id specifies the value of the id column from which to start.
  #
  # Raises RuntimeError when the adapter is missing or is not 'mysql'.
  def Sql2Avro.avroize(database_config, table, min_id)
    raise "Database interface not specified." unless database_config.has_key?('adapter')
    raise "Database interface not supported: #{database_config['adapter']}" if database_config['adapter'] != 'mysql'

    interface = MySql.new(database_config)

    schema = Yajl::Encoder.encode(interface.schema(table))
    max_id = interface.max_id(table)

    timestamp = Time.now.utc.strftime('%Y-%m-%dT%H:%M:%SZ')
    filename = "#{table}.#{timestamp}.#{min_id}.#{max_id}.avro"

    retval = {
      max_id: max_id,
      path: filename
    }

    begin
      # Argument-list form: no shell is involved, so quotes inside the schema
      # JSON or the table name cannot break (or inject into) the command.
      command = [
        'java', '-jar', AVRO_TOOLS_PATH,
        'fromjson', '--codec', 'snappy', '--schema', schema, '/dev/stdin'
      ]

      # The child's stdout goes straight to the output file; its stderr is
      # inherited so avro-tools diagnostics stay visible. The global $stdout
      # is deliberately left alone.
      IO.popen(command, 'w', out: filename) do |avro_stdin|
        interface.data(table, min_id, max_id).each do |datum|
          Yajl::Encoder.encode(datum, avro_stdin)
          avro_stdin.write "\n"
        end
      end

      # IO.popen's block form sets $? when the pipe is closed.
      raise "avro-tools exited with status #{$?.exitstatus}" unless $?.success?
    rescue => e
      retval[:error] = e
    end

    retval
  end
end
|
63
|
+
|
Binary file
|
metadata
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: sql2avro
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Mason Simon
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-06-20 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: yajl-ruby
|
16
|
+
requirement: &70282006753460 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *70282006753460
|
25
|
+
description: sql2avro extracts data from a specified SQL database table and transforms
|
26
|
+
it into an Avro file with a schema based on the database table's schema. The intended
|
27
|
+
use case is to incrementally load data out of an SQL database and into HDFS for
|
28
|
+
analysis via Hadoop.
|
29
|
+
email:
|
30
|
+
- mason@verbasoftware.com
|
31
|
+
executables: []
|
32
|
+
extensions: []
|
33
|
+
extra_rdoc_files: []
|
34
|
+
files:
|
35
|
+
- lib/interface/interface.rb
|
36
|
+
- lib/interface/mysql.rb
|
37
|
+
- lib/sql2avro.rb
|
38
|
+
- vendor/avro-tools-1.7.4.jar
|
39
|
+
- Makefile
|
40
|
+
homepage: https://github.com/Verba/sql2avro
|
41
|
+
licenses:
|
42
|
+
- Apache 2.0
|
43
|
+
post_install_message:
|
44
|
+
rdoc_options: []
|
45
|
+
require_paths:
|
46
|
+
- lib
|
47
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
48
|
+
none: false
|
49
|
+
requirements:
|
50
|
+
- - ! '>='
|
51
|
+
- !ruby/object:Gem::Version
|
52
|
+
version: '0'
|
53
|
+
segments:
|
54
|
+
- 0
|
55
|
+
hash: -1867060671403551677
|
56
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
segments:
|
63
|
+
- 0
|
64
|
+
hash: -1867060671403551677
|
65
|
+
requirements: []
|
66
|
+
rubyforge_project:
|
67
|
+
rubygems_version: 1.8.10
|
68
|
+
signing_key:
|
69
|
+
specification_version: 3
|
70
|
+
summary: Tool for pulling data from SQL database tables into Avro files.
|
71
|
+
test_files: []
|