avro-salsify-fork 1.9.0.0
- checksums.yaml +7 -0
- data/CHANGELOG +1 -0
- data/LICENSE +203 -0
- data/Manifest +31 -0
- data/NOTICE +6 -0
- data/Rakefile +66 -0
- data/avro-salsify-fork.gemspec +35 -0
- data/avro.gemspec +35 -0
- data/interop/test_interop.rb +41 -0
- data/lib/avro.rb +42 -0
- data/lib/avro/data_file.rb +366 -0
- data/lib/avro/io.rb +619 -0
- data/lib/avro/ipc.rb +551 -0
- data/lib/avro/logical_types.rb +84 -0
- data/lib/avro/protocol.rb +161 -0
- data/lib/avro/schema.rb +434 -0
- data/lib/avro/schema_normalization.rb +83 -0
- data/test/case_finder.rb +87 -0
- data/test/random_data.rb +90 -0
- data/test/sample_ipc_client.rb +85 -0
- data/test/sample_ipc_http_client.rb +84 -0
- data/test/sample_ipc_http_server.rb +79 -0
- data/test/sample_ipc_server.rb +92 -0
- data/test/test_datafile.rb +214 -0
- data/test/test_fingerprints.rb +37 -0
- data/test/test_help.rb +23 -0
- data/test/test_io.rb +451 -0
- data/test/test_logical_types.rb +111 -0
- data/test/test_protocol.rb +199 -0
- data/test/test_schema.rb +146 -0
- data/test/test_schema_normalization.rb +171 -0
- data/test/test_socket_transport.rb +40 -0
- data/test/tool.rb +144 -0
- metadata +114 -0
data/lib/avro/data_file.rb
ADDED
@@ -0,0 +1,366 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

require 'openssl'

module Avro
  module DataFile
    VERSION = 1
    MAGIC = "Obj" + [VERSION].pack('c')
    MAGIC.force_encoding('BINARY') if MAGIC.respond_to?(:force_encoding)
    MAGIC_SIZE = MAGIC.respond_to?(:bytesize) ? MAGIC.bytesize : MAGIC.size
    SYNC_SIZE = 16
    SYNC_INTERVAL = 4000 * SYNC_SIZE
    META_SCHEMA = Schema.parse('{"type": "map", "values": "bytes"}')
    VALID_ENCODINGS = ['binary'] # not used yet

    class DataFileError < AvroError; end

    def self.open(file_path, mode='r', schema=nil, codec=nil)
      schema = Avro::Schema.parse(schema) if schema
      case mode
      when 'w'
        unless schema
          raise DataFileError, "Writing an Avro file requires a schema."
        end
        io = open_writer(File.open(file_path, 'wb'), schema, codec)
      when 'r'
        io = open_reader(File.open(file_path, 'rb'), schema)
      else
        raise DataFileError, "Only modes 'r' and 'w' allowed. You gave #{mode.inspect}."
      end

      yield io if block_given?
      io
    ensure
      io.close if block_given? && io
    end

    def self.codecs
      @codecs
    end

    def self.register_codec(codec)
      @codecs ||= {}
      codec = codec.new if !codec.respond_to?(:codec_name) && codec.is_a?(Class)
      @codecs[codec.codec_name.to_s] = codec
    end

    def self.get_codec(codec)
      codec ||= 'null'
      if codec.respond_to?(:compress) && codec.respond_to?(:decompress)
        codec # it's a codec instance
      elsif codec.is_a?(Class)
        codec.new # it's a codec class
      elsif @codecs.include?(codec.to_s)
        @codecs[codec.to_s] # it's a string or symbol (codec name)
      else
        raise DataFileError, "Unknown codec: #{codec.inspect}"
      end
    end

    class << self
      private

      def open_writer(file, schema, codec=nil)
        writer = Avro::IO::DatumWriter.new(schema)
        Avro::DataFile::Writer.new(file, writer, schema, codec)
      end

      def open_reader(file, schema)
        reader = Avro::IO::DatumReader.new(nil, schema)
        Avro::DataFile::Reader.new(file, reader)
      end
    end

    class Writer
      def self.generate_sync_marker
        OpenSSL::Random.random_bytes(16)
      end

      attr_reader :writer, :encoder, :datum_writer, :buffer_writer, :buffer_encoder, :sync_marker, :meta, :codec
      attr_accessor :block_count

      def initialize(writer, datum_writer, writers_schema=nil, codec=nil, meta={})
        # If writers_schema is not present, presume we're appending
        @writer = writer
        @encoder = IO::BinaryEncoder.new(@writer)
        @datum_writer = datum_writer
        @meta = meta
        @buffer_writer = StringIO.new('', 'w')
        @buffer_writer.set_encoding('BINARY') if @buffer_writer.respond_to?(:set_encoding)
        @buffer_encoder = IO::BinaryEncoder.new(@buffer_writer)
        @block_count = 0

        if writers_schema
          @sync_marker = Writer.generate_sync_marker
          @codec = DataFile.get_codec(codec)
          @meta['avro.codec'] = @codec.codec_name.to_s
          @meta['avro.schema'] = writers_schema.to_s
          datum_writer.writers_schema = writers_schema
          write_header
        else
          # open writer for reading to collect metadata
          dfr = Reader.new(writer, Avro::IO::DatumReader.new)

          # FIXME(jmhodges): collect arbitrary metadata
          # collect metadata
          @sync_marker = dfr.sync_marker
          @meta['avro.codec'] = dfr.meta['avro.codec']
          @codec = DataFile.get_codec(meta['avro.codec'])

          # get schema used to write existing file
          schema_from_file = dfr.meta['avro.schema']
          @meta['avro.schema'] = schema_from_file
          datum_writer.writers_schema = Schema.parse(schema_from_file)

          # seek to the end of the file and prepare for writing
          writer.seek(0, 2)
        end
      end

      # Append a datum to the file
      def <<(datum)
        datum_writer.write(datum, buffer_encoder)
        self.block_count += 1

        # if the buffered data is larger than the sync interval, write
        # out the block
        if buffer_writer.tell >= SYNC_INTERVAL
          write_block
        end
      end

      # Return the current position as a value that may be passed to
      # DataFileReader.seek(long). Forces the end of the current block,
      # emitting a synchronization marker.
      def sync
        write_block
        writer.tell
      end

      # Flush the current state of the file, including metadata
      def flush
        write_block
        writer.flush
      end

      def close
        flush
        writer.close
      end

      private

      def write_header
        # write magic
        writer.write(MAGIC)

        # write metadata
        datum_writer.write_data(META_SCHEMA, meta, encoder)

        # write sync marker
        writer.write(sync_marker)
      end

      # TODO(jmhodges): make a schema for blocks and use datum_writer
      # TODO(jmhodges): do we really need the number of items in the block?
      def write_block
        if block_count > 0
          # write number of items in block and block size in bytes
          encoder.write_long(block_count)
          to_write = codec.compress(buffer_writer.string)
          encoder.write_long(to_write.respond_to?(:bytesize) ? to_write.bytesize : to_write.size)

          # write block contents
          writer.write(to_write)

          # write sync marker
          writer.write(sync_marker)

          # reset buffer
          buffer_writer.truncate(0)
          buffer_writer.rewind
          self.block_count = 0
        end
      end
    end

    # Read files written by DataFileWriter
    class Reader
      include ::Enumerable

      # The reader and binary decoder for the raw file stream
      attr_reader :reader, :decoder

      # The binary decoder for the contents of a block (after codec decompression)
      attr_reader :block_decoder

      attr_reader :datum_reader, :sync_marker, :meta, :file_length, :codec
      attr_accessor :block_count # records remaining in current block

      def initialize(reader, datum_reader)
        @reader = reader
        @decoder = IO::BinaryDecoder.new(reader)
        @datum_reader = datum_reader

        # read the header: magic, meta, sync
        read_header

        @codec = DataFile.get_codec(meta['avro.codec'])

        # get ready to read
        @block_count = 0
        datum_reader.writers_schema = Schema.parse meta['avro.schema']
      end

      # Iterates through each datum in this file
      # TODO(jmhodges): handle block of length zero
      def each
        loop do
          if block_count == 0
            case
            when eof?; break
            when skip_sync
              break if eof?
              read_block_header
            else
              read_block_header
            end
          end

          datum = datum_reader.read(block_decoder)
          self.block_count -= 1
          yield(datum)
        end
      end

      def eof?; reader.eof?; end

      def close
        reader.close
      end

      private

      def read_header
        # seek to the beginning of the file to get magic block
        reader.seek(0, 0)

        # check magic number
        magic_in_file = reader.read(MAGIC_SIZE)
        if magic_in_file.size < MAGIC_SIZE
          msg = 'Not an Avro data file: shorter than the Avro magic block'
          raise DataFileError, msg
        elsif magic_in_file != MAGIC
          msg = "Not an Avro data file: #{magic_in_file.inspect} doesn't match #{MAGIC.inspect}"
          raise DataFileError, msg
        end

        # read metadata
        @meta = datum_reader.read_data(META_SCHEMA,
                                       META_SCHEMA,
                                       decoder)
        # read sync marker
        @sync_marker = reader.read(SYNC_SIZE)
      end

      def read_block_header
        self.block_count = decoder.read_long
        block_bytes = decoder.read_long
        data = codec.decompress(reader.read(block_bytes))
        @block_decoder = IO::BinaryDecoder.new(StringIO.new(data))
      end

      # read SYNC_SIZE bytes; if they match the sync marker, return
      # true. Otherwise, seek back to where we started and return false
      def skip_sync
        proposed_sync_marker = reader.read(SYNC_SIZE)
        if proposed_sync_marker != sync_marker
          reader.seek(-SYNC_SIZE, 1)
          false
        else
          true
        end
      end
    end

    class NullCodec
      def codec_name; 'null'; end
      def decompress(data); data; end
      def compress(data); data; end
    end

    class DeflateCodec
      attr_reader :level

      def initialize(level=Zlib::DEFAULT_COMPRESSION)
        @level = level
      end

      def codec_name; 'deflate'; end

      def decompress(compressed)
        # Passing a negative number to Inflate puts it into "raw" RFC1951 mode
        # (without the RFC1950 header & checksum). See the docs for
        # inflateInit2 in http://www.zlib.net/manual.html
        zstream = Zlib::Inflate.new(-Zlib::MAX_WBITS)
        data = zstream.inflate(compressed)
        data << zstream.finish
      ensure
        zstream.close
      end

      def compress(data)
        zstream = Zlib::Deflate.new(level, -Zlib::MAX_WBITS)
        compressed = zstream.deflate(data)
        compressed << zstream.finish
      ensure
        zstream.close
      end
    end

    class SnappyCodec
      def codec_name; 'snappy'; end

      def decompress(data)
        load_snappy!
        Snappy.inflate(data)
      end

      def compress(data)
        load_snappy!
        Snappy.deflate(data)
      end

      private

      def load_snappy!
        require 'snappy' unless defined?(Snappy)
      rescue LoadError
        raise LoadError, "Snappy compression is not available, please install the `snappy` gem."
      end
    end

    DataFile.register_codec NullCodec
    DataFile.register_codec DeflateCodec
    DataFile.register_codec SnappyCodec

    # TODO this constant won't be updated if you register another codec.
    # Deprecated in favor of Avro::DataFile::codecs
    VALID_CODECS = DataFile.codecs.keys
  end
end
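For orientation, here is a minimal sketch of driving the DataFile API above end to end. The schema JSON, file name, and field values are illustrative assumptions, not part of the gem:

require 'avro'

# A hypothetical writer's schema; any valid Avro schema JSON works here.
SCHEMA = '{"type": "record", "name": "User",
           "fields": [{"name": "name", "type": "string"}]}'

# mode 'w' requires a schema; the codec may be 'null', 'deflate', or
# 'snappy' (the three codecs registered at the bottom of this file).
Avro::DataFile.open('users.avro', 'w', SCHEMA, 'deflate') do |writer|
  writer << {'name' => 'alice'}
  writer << {'name' => 'bob'}
end

# mode 'r' (the default) recovers the writer's schema from the file
# header, so no schema argument is needed; Reader mixes in Enumerable.
Avro::DataFile.open('users.avro') do |reader|
  reader.each { |datum| puts datum['name'] }
end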
data/lib/avro/io.rb
ADDED
@@ -0,0 +1,619 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

module Avro
  module IO
    # Raised when datum is not an example of schema
    class AvroTypeError < AvroError
      def initialize(expected_schema, datum)
        super("The datum #{datum.inspect} is not an example of schema #{expected_schema}")
      end
    end

    # Raised when writer's and reader's schema do not match
    class SchemaMatchException < AvroError
      def initialize(writers_schema, readers_schema)
        super("Writer's schema #{writers_schema} and Reader's schema " +
              "#{readers_schema} do not match.")
      end
    end

    # FIXME(jmhodges) move validate to this module?

    class BinaryDecoder
      # Read leaf values

      # reader is an object on which we can call read, seek and tell.
      attr_reader :reader

      def initialize(reader)
        @reader = reader
      end

      def byte!
        @reader.read(1).unpack('C').first
      end

      def read_null
        # null is written as zero bytes
        nil
      end

      def read_boolean
        byte! == 1
      end

      def read_int; read_long; end

      def read_long
        # int and long values are written using variable-length,
        # zig-zag coding.
        b = byte!
        n = b & 0x7F
        shift = 7
        while (b & 0x80) != 0
          b = byte!
          n |= (b & 0x7F) << shift
          shift += 7
        end
        (n >> 1) ^ -(n & 1)
      end

      def read_float
        # A float is written as 4 bytes.
        # The float is converted into a 32-bit integer using a method
        # equivalent to Java's floatToIntBits and then encoded in
        # little-endian format.
        @reader.read(4).unpack('e')[0]
      end

      def read_double
        # A double is written as 8 bytes.
        # The double is converted into a 64-bit integer using a method
        # equivalent to Java's doubleToLongBits and then encoded in
        # little-endian format.
        @reader.read(8).unpack('E')[0]
      end

      def read_bytes
        # Bytes are encoded as a long followed by that many bytes of
        # data.
        read(read_long)
      end

      def read_string
        # A string is encoded as a long followed by that many bytes of
        # UTF-8 encoded character data.
        read_bytes.tap do |string|
          string.force_encoding("UTF-8") if string.respond_to? :force_encoding
        end
      end

      def read(len)
        # Read len bytes
        @reader.read(len)
      end

      def skip_null
        nil
      end

      def skip_boolean
        skip(1)
      end

      def skip_int
        skip_long
      end

      def skip_long
        b = byte!
        while (b & 0x80) != 0
          b = byte!
        end
      end

      def skip_float
        skip(4)
      end

      def skip_double
        skip(8)
      end

      def skip_bytes
        skip(read_long)
      end

      def skip_string
        skip_bytes
      end

      def skip(n)
        reader.seek(reader.tell + n)
      end
    end

    # Write leaf values
    class BinaryEncoder
      attr_reader :writer

      def initialize(writer)
        @writer = writer
      end

      # null is written as zero bytes
      def write_null(datum)
        nil
      end

      # a boolean is written as a single byte
      # whose value is either 0 (false) or 1 (true).
      def write_boolean(datum)
        on_disk = datum ? 1.chr : 0.chr
        writer.write(on_disk)
      end

      # int and long values are written using variable-length,
      # zig-zag coding.
      def write_int(n)
        write_long(n)
      end

      # int and long values are written using variable-length,
      # zig-zag coding.
      def write_long(n)
        n = (n << 1) ^ (n >> 63)
        while (n & ~0x7F) != 0
          @writer.write(((n & 0x7f) | 0x80).chr)
          n >>= 7
        end
        @writer.write(n.chr)
      end

      # A float is written as 4 bytes.
      # The float is converted into a 32-bit integer using a method
      # equivalent to Java's floatToIntBits and then encoded in
      # little-endian format.
      def write_float(datum)
        @writer.write([datum].pack('e'))
      end

      # A double is written as 8 bytes.
      # The double is converted into a 64-bit integer using a method
      # equivalent to Java's doubleToLongBits and then encoded in
      # little-endian format.
      def write_double(datum)
        @writer.write([datum].pack('E'))
      end

      # Bytes are encoded as a long followed by that many bytes of data.
      def write_bytes(datum)
        write_long(datum.bytesize)
        @writer.write(datum)
      end

      # A string is encoded as a long followed by that many bytes of
      # UTF-8 encoded character data
      def write_string(datum)
        datum = datum.encode('utf-8') if datum.respond_to? :encode
        write_bytes(datum)
      end

      # Write an arbitrary datum.
      def write(datum)
        writer.write(datum)
      end
    end

    class DatumReader
      def self.match_schemas(writers_schema, readers_schema)
        w_type = writers_schema.type_sym
        r_type = readers_schema.type_sym

        # This conditional is begging for some OO love.
        if w_type == :union || r_type == :union
          return true
        end

        if w_type == r_type
          return true if Schema::PRIMITIVE_TYPES_SYM.include?(r_type)

          case r_type
          when :record
            return writers_schema.fullname == readers_schema.fullname
          when :error
            return writers_schema.fullname == readers_schema.fullname
          when :request
            return true
          when :fixed
            return writers_schema.fullname == readers_schema.fullname &&
                   writers_schema.size == readers_schema.size
          when :enum
            return writers_schema.fullname == readers_schema.fullname
          when :map
            return writers_schema.values.type == readers_schema.values.type
          when :array
            return writers_schema.items.type == readers_schema.items.type
          end
        end

        # Handle schema promotion
        if w_type == :int && [:long, :float, :double].include?(r_type)
          return true
        elsif w_type == :long && [:float, :double].include?(r_type)
          return true
        elsif w_type == :float && r_type == :double
          return true
        end

        return false
      end

      attr_accessor :writers_schema, :readers_schema

      def initialize(writers_schema=nil, readers_schema=nil)
        @writers_schema = writers_schema
        @readers_schema = readers_schema
      end

      def read(decoder)
        self.readers_schema = writers_schema unless readers_schema
        read_data(writers_schema, readers_schema, decoder)
      end

      def read_data(writers_schema, readers_schema, decoder)
        # schema matching
        unless self.class.match_schemas(writers_schema, readers_schema)
          raise SchemaMatchException.new(writers_schema, readers_schema)
        end

        # schema resolution: reader's schema is a union, writer's
        # schema is not
        if writers_schema.type_sym != :union && readers_schema.type_sym == :union
          rs = readers_schema.schemas.find{|s|
            self.class.match_schemas(writers_schema, s)
          }
          return read_data(writers_schema, rs, decoder) if rs
          raise SchemaMatchException.new(writers_schema, readers_schema)
        end

        # function dispatch for reading data based on type of writer's
        # schema
        datum = case writers_schema.type_sym
        when :null;    decoder.read_null
        when :boolean; decoder.read_boolean
        when :string;  decoder.read_string
        when :int;     decoder.read_int
        when :long;    decoder.read_long
        when :float;   decoder.read_float
        when :double;  decoder.read_double
        when :bytes;   decoder.read_bytes
        when :fixed;   read_fixed(writers_schema, readers_schema, decoder)
        when :enum;    read_enum(writers_schema, readers_schema, decoder)
        when :array;   read_array(writers_schema, readers_schema, decoder)
        when :map;     read_map(writers_schema, readers_schema, decoder)
        when :union;   read_union(writers_schema, readers_schema, decoder)
        when :record, :error, :request; read_record(writers_schema, readers_schema, decoder)
        else
          raise AvroError, "Cannot read unknown schema type: #{writers_schema.type}"
        end

        readers_schema.type_adapter.decode(datum)
      end

      def read_fixed(writers_schema, readers_schema, decoder)
        decoder.read(writers_schema.size)
      end

      def read_enum(writers_schema, readers_schema, decoder)
        index_of_symbol = decoder.read_int
        read_symbol = writers_schema.symbols[index_of_symbol]

        # TODO(jmhodges): figure out what unset means for resolution
        # schema resolution
        unless readers_schema.symbols.include?(read_symbol)
          # 'unset' here
        end

        read_symbol
      end

      def read_array(writers_schema, readers_schema, decoder)
        read_items = []
        block_count = decoder.read_long
        while block_count != 0
          if block_count < 0
            block_count = -block_count
            # a negative count is followed by the block's byte size,
            # which must be consumed from the stream even though it is
            # not needed here
            block_size = decoder.read_long
          end
          block_count.times do
            read_items << read_data(writers_schema.items,
                                    readers_schema.items,
                                    decoder)
          end
          block_count = decoder.read_long
        end

        read_items
      end

      def read_map(writers_schema, readers_schema, decoder)
        read_items = {}
        block_count = decoder.read_long
        while block_count != 0
          if block_count < 0
            block_count = -block_count
            # consume the block's byte size (see read_array)
            block_size = decoder.read_long
          end
          block_count.times do
            key = decoder.read_string
            read_items[key] = read_data(writers_schema.values,
                                        readers_schema.values,
                                        decoder)
          end
          block_count = decoder.read_long
        end

        read_items
      end

      def read_union(writers_schema, readers_schema, decoder)
        index_of_schema = decoder.read_long
        selected_writers_schema = writers_schema.schemas[index_of_schema]

        read_data(selected_writers_schema, readers_schema, decoder)
      end

      def read_record(writers_schema, readers_schema, decoder)
        readers_fields_hash = readers_schema.fields_hash
        read_record = {}
        writers_schema.fields.each do |field|
          if readers_field = readers_fields_hash[field.name]
            field_val = read_data(field.type, readers_field.type, decoder)
            read_record[field.name] = field_val
          else
            skip_data(field.type, decoder)
          end
        end

        # fill in the default values
        if readers_fields_hash.size > read_record.size
          writers_fields_hash = writers_schema.fields_hash
          readers_fields_hash.each do |field_name, field|
            unless writers_fields_hash.has_key? field_name
              if !field.default.nil?
                field_val = read_default_value(field.type, field.default)
                read_record[field.name] = field_val
              else
                # FIXME(jmhodges) another 'unset' here
              end
            end
          end
        end

        read_record
      end

      def read_default_value(field_schema, default_value)
        # Basically a JSON Decoder?
        case field_schema.type_sym
        when :null
          return nil
        when :boolean
          return default_value
        when :int, :long
          return Integer(default_value)
        when :float, :double
          return Float(default_value)
        when :enum, :fixed, :string, :bytes
          return default_value
        when :array
          read_array = []
          default_value.each do |json_val|
            item_val = read_default_value(field_schema.items, json_val)
            read_array << item_val
          end
          return read_array
        when :map
          read_map = {}
          default_value.each do |key, json_val|
            map_val = read_default_value(field_schema.values, json_val)
            read_map[key] = map_val
          end
          return read_map
        when :union
          return read_default_value(field_schema.schemas[0], default_value)
        when :record, :error
          read_record = {}
          field_schema.fields.each do |field|
            json_val = default_value[field.name]
            json_val = field.default unless json_val
            field_val = read_default_value(field.type, json_val)
            read_record[field.name] = field_val
          end
          return read_record
        else
          fail_msg = "Unknown type: #{field_schema.type}"
          raise AvroError, fail_msg
        end
      end

      def skip_data(writers_schema, decoder)
        case writers_schema.type_sym
        when :null
          decoder.skip_null
        when :boolean
          decoder.skip_boolean
        when :string
          decoder.skip_string
        when :int
          decoder.skip_int
        when :long
          decoder.skip_long
        when :float
          decoder.skip_float
        when :double
          decoder.skip_double
        when :bytes
          decoder.skip_bytes
        when :fixed
          skip_fixed(writers_schema, decoder)
        when :enum
          skip_enum(writers_schema, decoder)
        when :array
          skip_array(writers_schema, decoder)
        when :map
          skip_map(writers_schema, decoder)
        when :union
          skip_union(writers_schema, decoder)
        when :record, :error, :request
          skip_record(writers_schema, decoder)
        else
          raise AvroError, "Unknown schema type: #{writers_schema.type}"
        end
      end

      def skip_fixed(writers_schema, decoder)
        decoder.skip(writers_schema.size)
      end

      def skip_enum(writers_schema, decoder)
        decoder.skip_int
      end

      def skip_union(writers_schema, decoder)
        index = decoder.read_long
        skip_data(writers_schema.schemas[index], decoder)
      end

      def skip_array(writers_schema, decoder)
        skip_blocks(decoder) { skip_data(writers_schema.items, decoder) }
      end

      def skip_map(writers_schema, decoder)
        skip_blocks(decoder) {
          decoder.skip_string
          skip_data(writers_schema.values, decoder)
        }
      end

      def skip_record(writers_schema, decoder)
        writers_schema.fields.each{|f| skip_data(f.type, decoder) }
      end

      private

      def skip_blocks(decoder, &blk)
        block_count = decoder.read_long
        while block_count != 0
          if block_count < 0
            decoder.skip(decoder.read_long)
          else
            block_count.times(&blk)
          end
          block_count = decoder.read_long
        end
      end
    end # DatumReader

    # DatumWriter for generic ruby objects
    class DatumWriter
      attr_accessor :writers_schema

      def initialize(writers_schema=nil)
        @writers_schema = writers_schema
      end

      def write(datum, encoder)
        write_data(writers_schema, datum, encoder)
      end

      def write_data(writers_schema, logical_datum, encoder)
        datum = writers_schema.type_adapter.encode(logical_datum)

        unless Schema.validate(writers_schema, datum)
          raise AvroTypeError.new(writers_schema, datum)
        end

        # function dispatch to write datum
        case writers_schema.type_sym
        when :null;    encoder.write_null(datum)
        when :boolean; encoder.write_boolean(datum)
        when :string;  encoder.write_string(datum)
        when :int;     encoder.write_int(datum)
        when :long;    encoder.write_long(datum)
        when :float;   encoder.write_float(datum)
        when :double;  encoder.write_double(datum)
        when :bytes;   encoder.write_bytes(datum)
        when :fixed;   write_fixed(writers_schema, datum, encoder)
        when :enum;    write_enum(writers_schema, datum, encoder)
        when :array;   write_array(writers_schema, datum, encoder)
        when :map;     write_map(writers_schema, datum, encoder)
        when :union;   write_union(writers_schema, datum, encoder)
        when :record, :error, :request; write_record(writers_schema, datum, encoder)
        else
          raise AvroError.new("Unknown type: #{writers_schema.type}")
        end
      end

      def write_fixed(writers_schema, datum, encoder)
        encoder.write(datum)
      end

      def write_enum(writers_schema, datum, encoder)
        index_of_datum = writers_schema.symbols.index(datum)
        encoder.write_int(index_of_datum)
      end

      def write_array(writers_schema, datum, encoder)
        if datum.size > 0
          encoder.write_long(datum.size)
          datum.each do |item|
            write_data(writers_schema.items, item, encoder)
          end
        end
        encoder.write_long(0)
      end

      def write_map(writers_schema, datum, encoder)
        if datum.size > 0
          encoder.write_long(datum.size)
          datum.each do |k, v|
            encoder.write_string(k)
            write_data(writers_schema.values, v, encoder)
          end
        end
        encoder.write_long(0)
      end

      def write_union(writers_schema, datum, encoder)
        index_of_schema = -1
        found = writers_schema.schemas.
          find{|e| index_of_schema += 1; found = Schema.validate(e, datum) }
        unless found # Because find_index doesn't exist in 1.8.6
          raise AvroTypeError.new(writers_schema, datum)
        end
        encoder.write_long(index_of_schema)
        write_data(writers_schema.schemas[index_of_schema], datum, encoder)
      end

      def write_record(writers_schema, datum, encoder)
        writers_schema.fields.each do |field|
          write_data(field.type, datum[field.name], encoder)
        end
      end
    end # DatumWriter
  end
end
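The encoder/decoder and datum reader/writer above can also be used without the container framing from data_file.rb. A minimal round-trip sketch over an in-memory buffer, assuming the array-of-longs schema shown:

require 'avro'
require 'stringio'

schema = Avro::Schema.parse('{"type": "array", "items": "long"}')

# Encode: DatumWriter drives a BinaryEncoder over any IO-like object.
# Longs are zig-zag coded (0, -1, 1, -2, 2 map to 0, 1, 2, 3, 4), so
# small magnitudes take few bytes. A binary buffer mirrors what
# data_file.rb does with set_encoding('BINARY').
buffer = StringIO.new(''.b)
Avro::IO::DatumWriter.new(schema).write([1, -2, 300],
                                        Avro::IO::BinaryEncoder.new(buffer))

# Decode: DatumReader takes the writer's and reader's schemas (equal
# here, so no schema resolution or promotion is exercised).
buffer.rewind
reader = Avro::IO::DatumReader.new(schema, schema)
p reader.read(Avro::IO::BinaryDecoder.new(buffer))  # => [1, -2, 300]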