sql2avro 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Makefile ADDED
@@ -0,0 +1,6 @@
1
# Download the Avro tools jar that sql2avro shells out to.
# $@ already expands to the full target path (vendor/avro-tools-1.7.4.jar), so
# it must not be prefixed with vendor/ again. --fail stops curl from saving an
# HTTP error page as if it were the jar.
vendor/avro-tools-1.7.4.jar:
	mkdir -p $(@D)
	curl --fail --location --output $@ http://www.us.apache.org/dist/avro/avro-1.7.4/java/avro-tools-1.7.4.jar

# Build the gem. $< is the first prerequisite (the gemspec); the jar is listed
# as a prerequisite as well because it is packaged inside the gem.
sql2avro-0.1.0.gem: sql2avro.gemspec vendor/avro-tools-1.7.4.jar
	bundle exec gem build $<
6
+
@@ -0,0 +1,10 @@
1
# Abstract base class for the database adapters used by sql2avro.
#
# Every method here raises; a concrete adapter is expected to override all of
# them. The signatures mirror how the adapter is implemented and called in
# this package: data takes an id range, and max_id is part of the contract.
class DbInterface
  # Should return the Avro schema (as a JSON-encodable structure) for +table+.
  #
  # Raises RuntimeError unless overridden.
  def schema(table)
    raise "Return Avro JSON schema for #{table}"
  end

  # Should return the greatest value of the id column in +table+.
  #
  # Raises RuntimeError unless overridden.
  def max_id(table)
    raise "Return maximum id for #{table}"
  end

  # Should return the rows of +table+ whose id lies in min_id..max_id, encoded
  # as Avro JSON data. The range arguments are optional here only so that
  # existing one-argument callers keep working.
  #
  # Raises RuntimeError unless overridden.
  def data(table, min_id = nil, max_id = nil)
    raise "Return Avro JSON data for #{table}"
  end
end
10
+
@@ -0,0 +1,203 @@
1
+ require_relative './interface'
2
+ require 'open3'
3
+
4
# MySQL adapter: shells out to the `mysql` command-line client in batch mode
# and turns its tab-separated output into Avro schemas and Avro JSON records.
#
# NOTE(review): values are taken verbatim from `mysql --batch` output. The
# client's escaping of special characters is not undone here, and a string
# column holding the literal text "NULL" cannot be told apart from SQL NULL.
class MySql < DbInterface
  # Column separator used by `mysql --batch`.
  MYSQL_BATCH_SEP = "\t"

  # Text printed by `mysql --batch` for SQL NULL.
  MYSQL_NULL = 'NULL'

  # config is a hash with this form (like ActiveRecord's):
  #   {
  #     host: "localhost",
  #     username: "myuser",
  #     password: "mypass",
  #     database: "somedatabase"
  #   }
  #
  # Keys may be strings (e.g. loaded from YAML) or symbols.
  def initialize(config)
    @db_host  = MySql.config_value(config, 'host')
    @db_name  = MySql.config_value(config, 'database')
    @username = MySql.config_value(config, 'username')
    @password = MySql.config_value(config, 'password')
  end

  # Builds the Avro record schema for +table+. Every field is declared as a
  # union of 'null' and the column's Avro type.
  #
  # Returns a Hash with :type, :name and :fields.
  def schema(table)
    fields = avro_types(table).map do |column, avro_type|
      { name: column, type: ['null', avro_type] }
    end

    { type: "record", name: table, fields: fields }
  end

  # Returns MAX(id) of +table+ as an Integer (0 when MySQL reports NULL), or
  # nil when the client produced no data row at all.
  def max_id(table)
    header_seen = false
    result = nil

    query("SELECT MAX(id) FROM #{table}") do |line|
      if header_seen
        # Only the first data row matters.
        result = line.first.to_i if result.nil?
      else
        header_seen = true
      end
    end

    result
  end

  # Fetches the rows of +table+ with min_id <= id <= max_id.
  #
  # Returns an Array of Hashes mapping column name to nil (SQL NULL) or to a
  # one-entry Hash { avro_type => value }, the JSON encoding Avro requires for
  # non-null union members; see:
  #   - http://avro.apache.org/docs/current/spec.html#json_encoding
  #   - http://mail-archives.apache.org/mod_mbox/avro-user/201304.mbox/%3CCD86687D.E892E%25scott@richrelevance.com%3E
  #
  # Raises ArgumentError/TypeError when an id bound is not an integer, and
  # RuntimeError for a column whose Avro type is not handled.
  def data(table, min_id, max_id)
    types = avro_types(table)

    # Coerce the bounds so nothing but integers is interpolated into the SQL.
    lower = Integer(min_id)
    upper = Integer(max_id)
    sql = "SELECT * FROM #{table} WHERE id >= #{lower} AND id <= #{upper}"

    columns = nil
    rows = []
    query(sql) do |line|
      # The first line of batch output is the header.
      if columns.nil?
        columns = line
        next
      end

      rows << build_row(columns, line, types)
    end

    # TODO: stream this data out rather than return all in one batch.
    rows
  end

  # Returns a Hash mapping each column name of +table+ to its MySQL type
  # string, as reported by DESCRIBE.
  def sql_schema(table)
    header_seen = false
    columns = {}

    query("DESCRIBE #{table}") do |line|
      if header_seen
        name, type = line
        columns[name] = type
      else
        header_seen = true
      end
    end

    columns
  end

  # Returns a Hash mapping each column name of +table+ to its Avro type name.
  def avro_types(table)
    sql_schema(table).each_with_object({}) do |(column, mysql_type), types|
      types[column] = MySql.avro_type(mysql_type)
    end
  end

  # Runs +sql+ against the configured database, yielding each output line
  # (header included) as an Array of column strings.
  def query(sql, &block)
    MySql.query(sql, @db_host, @db_name, @username, @password, &block)
  end

  # Reads +key+ from +config+, accepting either a String or a Symbol key.
  def self.config_value(config, key)
    config.fetch(key) { config[key.to_sym] }
  end

  # Runs +sql+ through the mysql client and yields every output line, split on
  # the batch separator, to +block+.
  #
  # The client is started from an argument vector, not a shell command line, so
  # quotes, "$" or backticks in the SQL or credentials reach mysql unchanged.
  #
  # NOTE(review): the password is still passed as a command-line argument, as
  # before, and is therefore visible in the process list.
  #
  # Raises RuntimeError when mysql exits with a non-zero status.
  def self.query(sql, db_host, db_name, username, password, &block)
    argv = ['mysql', '--batch', '--quick', "--execute=#{sql}", "--password=#{password}"]
    argv.push('--host', db_host.to_s) if db_host
    argv.push('--user', username.to_s) if username
    argv << db_name.to_s if db_name

    Open3.popen3(*argv) do |stdin, stdout, stderr, wait_thr|
      stdin.close

      # Drain stderr concurrently so a full stderr pipe cannot stall the client
      # while stdout is still being read.
      errors = Thread.new do
        begin
          stderr.read
        rescue IOError
          ''
        end
      end

      while (line = stdout.gets)
        # chomp removes only the line terminator; limit -1 keeps trailing empty
        # columns, which a plain split would drop.
        block.call(line.chomp.split(MYSQL_BATCH_SEP, -1))
      end

      status = wait_thr.value
      unless status.success?
        raise "mysql exited with status #{status.exitstatus}: #{errors.value.to_s.strip}"
      end
    end
  end

  # Maps a MySQL column type string (as printed by DESCRIBE) to an Avro type.
  # Refer to https://github.com/apache/sqoop/blob/trunk/src/java/org/apache/sqoop/manager/ConnManager.java#L172.
  #
  # The patterns are unanchored, so their order matters: more specific names
  # must be tested before the names they contain.
  #
  # Raises RuntimeError for a type that matches no pattern.
  def self.avro_type(mysql_type)
    case mysql_type

    # See https://dev.mysql.com/doc/refman/5.0/en/numeric-type-overview.html
    when /tinyint\(1\)/, /bool/
      'boolean'
    # Must precede the generic /int/ test, which also matches "bigint".
    when /bigint/, /serial/
      'long'
    # Covers tinyint, smallint, mediumint, int and integer.
    # NOTE(review): unsigned INT values can exceed Avro's 32-bit int.
    when /int/
      'int'
    when /dec/
      'string'
    when /float/
      'float'
    when /double/
      'double'
    when /varchar\(\d+\)/
      'string'

    # See https://dev.mysql.com/doc/refman/5.0/en/date-and-time-type-overview.html.
    when /date/, /time/
      'string'
    when /year/
      'int'

    # See https://dev.mysql.com/doc/refman/5.0/en/string-type-overview.html.
    when /char/
      'string'
    when /binary/
      'bytes'
    when /text/
      'string'
    when /blob/
      'bytes'
    else
      raise "Unsupported MySQL data type: #{mysql_type}"
    end
  end

  # Converts one raw column +value+ into its Avro JSON form for Avro +type+.
  #
  # Returns nil for SQL NULL (or a missing column), otherwise { type => value }
  # with the value cast to match +type+.
  #
  # Raises RuntimeError for an unknown +type+.
  def self.avro_value(type, value)
    return nil if value.nil? || value == MYSQL_NULL

    case type
    when 'boolean'
      # MySQL stores false as 0 and true as any non-zero value.
      { type => !value.to_i.zero? }
    when 'int', 'long'
      { type => value.to_i }
    when 'float', 'double'
      { type => value.to_f }
    when 'bytes', 'string'
      { type => value }
    else
      raise "Unsupported type: #{type}"
    end
  end

  private

  # Pairs the header +columns+ with one line of +values+ and casts each value
  # according to +types+ (column name => Avro type).
  def build_row(columns, values, types)
    columns.zip(values).each_with_object({}) do |(column, value), row|
      row[column] = MySql.avro_value(types[column], value)
    end
  end
end
203
+
data/lib/sql2avro.rb ADDED
@@ -0,0 +1,63 @@
1
+ require 'open3'
2
+ require 'yaml'
3
+ require 'yajl'
4
+ require_relative 'interface/mysql'
5
+
6
module Sql2Avro
  # Absolute path of the bundled avro-tools jar. This file sits in lib/ while
  # the jar is shipped in the gem's top-level vendor/ directory; the first ".."
  # only strips this file's name, so a second one is needed to leave lib/.
  AVRO_TOOLS_PATH = File.expand_path('../../vendor/avro-tools-1.7.4.jar', __FILE__)

  # Pulls data from the given database table starting from the given id.
  #
  # This function creates an Avro file as a side effect, and returns {
  #   max_id: greatest ID that was pulled in,
  #   path: filepath of the resulting avroized file
  #   error: the exception that was raised, if any; otherwise omitted
  # }
  #
  # database_config is a hash with this form (like ActiveRecord's):
  #   {
  #     adapter: "mysql",
  #     host: "localhost",
  #     username: "myuser",
  #     password: "mypass",
  #     database: "somedatabase"
  #   }
  # Keys may be strings or symbols.
  #
  # table is the table to pull from.
  #
  # min_id specifies the value of the id column from which to start.
  #
  # Raises RuntimeError when the adapter is missing or is not 'mysql'. Failures
  # while exporting the rows are reported through the :error entry instead.
  def Sql2Avro.avroize(database_config, table, min_id)
    # Normalize keys to strings so symbol-keyed configs, as documented above,
    # work as well as string-keyed ones.
    config = Hash[database_config.map { |key, value| [key.to_s, value] }]

    raise "Database interface not specified." unless config.key?('adapter')
    raise "Database interface not supported: #{config['adapter']}" if config['adapter'] != 'mysql'

    interface = MySql.new(config)

    schema = Yajl::Encoder.encode(interface.schema(table))
    max_id = interface.max_id(table)

    timestamp = Time.now.utc.strftime('%Y-%m-%dT%H:%M:%SZ')
    filename = "#{table}.#{timestamp}.#{min_id}.#{max_id}.avro"

    retval = {
      max_id: max_id,
      path: filename
    }

    begin
      # An argument vector (no shell) keeps quotes in the schema JSON or table
      # name from breaking the command; avro-tools' stdout goes straight to the
      # output file and its stderr is inherited from this process.
      command = ['java', '-jar', AVRO_TOOLS_PATH, 'fromjson',
                 '--codec', 'snappy', '--schema', schema, '/dev/stdin']

      waiters = nil
      Open3.pipeline_w(command, :out => filename) do |stdin, threads|
        waiters = threads

        interface.data(table, min_id, max_id).each do |datum|
          Yajl::Encoder.encode(datum, stdin)
          stdin.write "\n"
        end
      end

      # pipeline_w has closed stdin and joined the child by now.
      status = waiters.last.value
      raise "avro-tools exited with status #{status.exitstatus}" unless status.success?
    rescue => e
      retval[:error] = e
    end

    retval
  end
end
63
+
Binary file
metadata ADDED
@@ -0,0 +1,71 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: sql2avro
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Mason Simon
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-06-20 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: yajl-ruby
16
+ requirement: &70282006753460 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *70282006753460
25
+ description: sql2avro extracts data from a specified SQL database table and transforms
26
+ it into an Avro file with a schema based on the database table's schema. The intended
27
+ use case is to incrementally load data out of an SQL database and into HDFS for
28
+ analysis via Hadoop.
29
+ email:
30
+ - mason@verbasoftware.com
31
+ executables: []
32
+ extensions: []
33
+ extra_rdoc_files: []
34
+ files:
35
+ - lib/interface/interface.rb
36
+ - lib/interface/mysql.rb
37
+ - lib/sql2avro.rb
38
+ - vendor/avro-tools-1.7.4.jar
39
+ - Makefile
40
+ homepage: https://github.com/Verba/sql2avro
41
+ licenses:
42
+ - Apache 2.0
43
+ post_install_message:
44
+ rdoc_options: []
45
+ require_paths:
46
+ - lib
47
+ required_ruby_version: !ruby/object:Gem::Requirement
48
+ none: false
49
+ requirements:
50
+ - - ! '>='
51
+ - !ruby/object:Gem::Version
52
+ version: '0'
53
+ segments:
54
+ - 0
55
+ hash: -1867060671403551677
56
+ required_rubygems_version: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ segments:
63
+ - 0
64
+ hash: -1867060671403551677
65
+ requirements: []
66
+ rubyforge_project:
67
+ rubygems_version: 1.8.10
68
+ signing_key:
69
+ specification_version: 3
70
+ summary: Tool for pulling data from SQL database tables into Avro files.
71
+ test_files: []