avro-salsify-fork 1.9.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG +1 -0
- data/LICENSE +203 -0
- data/Manifest +31 -0
- data/NOTICE +6 -0
- data/Rakefile +66 -0
- data/avro-salsify-fork.gemspec +35 -0
- data/avro.gemspec +35 -0
- data/interop/test_interop.rb +41 -0
- data/lib/avro.rb +42 -0
- data/lib/avro/data_file.rb +366 -0
- data/lib/avro/io.rb +619 -0
- data/lib/avro/ipc.rb +551 -0
- data/lib/avro/logical_types.rb +84 -0
- data/lib/avro/protocol.rb +161 -0
- data/lib/avro/schema.rb +434 -0
- data/lib/avro/schema_normalization.rb +83 -0
- data/test/case_finder.rb +87 -0
- data/test/random_data.rb +90 -0
- data/test/sample_ipc_client.rb +85 -0
- data/test/sample_ipc_http_client.rb +84 -0
- data/test/sample_ipc_http_server.rb +79 -0
- data/test/sample_ipc_server.rb +92 -0
- data/test/test_datafile.rb +214 -0
- data/test/test_fingerprints.rb +37 -0
- data/test/test_help.rb +23 -0
- data/test/test_io.rb +451 -0
- data/test/test_logical_types.rb +111 -0
- data/test/test_protocol.rb +199 -0
- data/test/test_schema.rb +146 -0
- data/test/test_schema_normalization.rb +171 -0
- data/test/test_socket_transport.rb +40 -0
- data/test/tool.rb +144 -0
- metadata +114 -0
@@ -0,0 +1,366 @@
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
3
|
+
# distributed with this work for additional information
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
6
|
+
# "License"); you may not use this file except in compliance
|
7
|
+
# with the License. You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
# See the License for the specific language governing permissions and
|
15
|
+
# limitations under the License.
|
16
|
+
|
17
|
+
require 'openssl'
|
18
|
+
|
19
|
+
module Avro
|
20
|
+
module DataFile
|
21
|
+
VERSION = 1
|
22
|
+
MAGIC = "Obj" + [VERSION].pack('c')
|
23
|
+
MAGIC.force_encoding('BINARY') if MAGIC.respond_to?(:force_encoding)
|
24
|
+
MAGIC_SIZE = MAGIC.respond_to?(:bytesize) ? MAGIC.bytesize : MAGIC.size
|
25
|
+
SYNC_SIZE = 16
|
26
|
+
SYNC_INTERVAL = 4000 * SYNC_SIZE
|
27
|
+
META_SCHEMA = Schema.parse('{"type": "map", "values": "bytes"}')
|
28
|
+
VALID_ENCODINGS = ['binary'] # not used yet
|
29
|
+
|
30
|
+
# Error type raised by the DataFile code in this file, e.g. for an unsupported
# open mode, a missing write schema, an unknown codec or a bad magic header.
class DataFileError < AvroError; end
|
31
|
+
|
32
|
+
# Opens the Avro data file at +file_path+ for reading or writing.
#
# file_path - path of the file to open.
# mode      - 'r' (default) to read, 'w' to write; any other value raises.
# schema    - schema JSON text, parsed with Avro::Schema.parse when given.
#             Required for mode 'w'; in mode 'r' it is passed on to
#             open_reader.
# codec     - passed through to open_writer (only used in mode 'w').
#
# When a block is given the reader/writer is yielded to it and closed
# afterwards, even if the block raises. The reader/writer object is the
# return value in both cases.
#
# Raises DataFileError when mode 'w' is used without a schema or when the
# mode is neither 'r' nor 'w'.
def self.open(file_path, mode='r', schema=nil, codec=nil)
  schema = Avro::Schema.parse(schema) if schema
  case mode
  when 'w'
    unless schema
      raise DataFileError, "Writing an Avro file requires a schema."
    end
    file = File.open(file_path, 'wb')
    io = open_writer(file, schema, codec)
  when 'r'
    file = File.open(file_path, 'rb')
    io = open_reader(file, schema)
  else
    raise DataFileError, "Only modes 'r' and 'w' allowed. You gave #{mode.inspect}."
  end

  yield io if block_given?
  io
ensure
  if io
    io.close if block_given?
  elsif file && !file.closed?
    # Building the reader/writer failed, so nothing else owns the raw File
    # handle yet; close it here instead of leaking the descriptor.
    file.close
  end
end
|
51
|
+
|
52
|
+
# Returns the codec registry (codec name String => codec object) built up by
# register_codec, or nil if register_codec has not been called yet.
def self.codecs
  defined?(@codecs) ? @codecs : nil
end
|
55
|
+
|
56
|
+
# Adds +codec+ to the registry under its codec_name (converted to a String).
#
# +codec+ may be a codec instance, or a Class; a Class that does not itself
# respond to codec_name is instantiated with no arguments first.
# Returns the registered codec object.
def self.register_codec(codec)
  registry = (@codecs ||= {})
  must_instantiate = codec.is_a?(Class) && !codec.respond_to?(:codec_name)
  instance = must_instantiate ? codec.new : codec
  registry[instance.codec_name.to_s] = instance
end
|
61
|
+
|
62
|
+
# Resolves +codec+ to a codec object.
#
# codec - nil (treated as 'null'), an object responding to both compress and
#         decompress (returned as is), a Class (instantiated with no
#         arguments), or a registered codec name given as String/Symbol.
#
# Raises DataFileError when the name is not in the registry. An empty or
# not-yet-initialised registry is treated as "no codecs known" rather than
# failing with NoMethodError on nil.
def self.get_codec(codec)
  codec ||= 'null'
  registry = @codecs || {}
  if codec.respond_to?(:compress) && codec.respond_to?(:decompress)
    codec # it's a codec instance
  elsif codec.is_a?(Class)
    codec.new # it's a codec class
  elsif registry.key?(codec.to_s)
    registry[codec.to_s] # it's a string or symbol (codec name)
  else
    raise DataFileError, "Unknown codec: #{codec.inspect}"
  end
end
|
74
|
+
|
75
|
+
class << self
  private

  # Builds a DataFile::Writer on +file+ whose datums are encoded by a new
  # DatumWriter for +schema+; +codec+ is handed to the Writer unchanged.
  def open_writer(file, schema, codec=nil)
    datum_writer = Avro::IO::DatumWriter.new(schema)
    Avro::DataFile::Writer.new(file, datum_writer, schema, codec)
  end

  # Builds a DataFile::Reader on +file+. +schema+ is given to the DatumReader
  # as its second argument (the reader's schema); the first is left nil.
  def open_reader(file, schema)
    datum_reader = Avro::IO::DatumReader.new(nil, schema)
    Avro::DataFile::Reader.new(file, datum_reader)
  end
end
|
87
|
+
|
88
|
+
# Writes datums to an Avro data file. A new file starts with a header (magic
# bytes, metadata map, sync marker); datums are buffered and emitted in
# blocks, each block followed by the sync marker.
class Writer
  # Returns SYNC_SIZE random bytes to be used as the file's sync marker.
  def self.generate_sync_marker
    OpenSSL::Random.random_bytes(SYNC_SIZE)
  end

  attr_reader :writer, :encoder, :datum_writer, :buffer_writer, :buffer_encoder, :sync_marker, :meta, :codec
  attr_accessor :block_count

  # writer         - IO-like object the file contents are written to.
  # datum_writer   - object used to encode each datum (and the metadata).
  # writers_schema - schema for a new file; when nil the existing contents of
  #                  +writer+ are read to recover sync marker, codec and
  #                  schema, and writing continues at the end (append).
  # codec          - codec name/class/instance, resolved by DataFile.get_codec.
  # meta           - metadata hash; 'avro.codec' and 'avro.schema' are set on
  #                  it (the hash passed in is modified).
  def initialize(writer, datum_writer, writers_schema=nil, codec=nil, meta={})
    # If writers_schema is not present, presume we're appending
    @writer = writer
    @encoder = IO::BinaryEncoder.new(@writer)
    @datum_writer = datum_writer
    @meta = meta
    @buffer_writer = StringIO.new('', 'w')
    @buffer_writer.set_encoding('BINARY') if @buffer_writer.respond_to?(:set_encoding)
    @buffer_encoder = IO::BinaryEncoder.new(@buffer_writer)
    @block_count = 0

    if writers_schema
      @sync_marker = Writer.generate_sync_marker
      @codec = DataFile.get_codec(codec)
      @meta['avro.codec'] = @codec.codec_name.to_s
      @meta['avro.schema'] = writers_schema.to_s
      datum_writer.writers_schema = writers_schema
      write_header
    else
      # open writer for reading to collect metadata
      dfr = Reader.new(writer, Avro::IO::DatumReader.new)

      # FIXME(jmhodges): collect arbitrary metadata
      # collect metadata
      @sync_marker = dfr.sync_marker
      @meta['avro.codec'] = dfr.meta['avro.codec']
      @codec = DataFile.get_codec(meta['avro.codec'])

      # get schema used to write existing file
      schema_from_file = dfr.meta['avro.schema']
      @meta['avro.schema'] = schema_from_file
      datum_writer.writers_schema = Schema.parse(schema_from_file)

      # seek to the end of the file and prepare for writing
      writer.seek(0, 2)
    end
  end

  # Append a datum to the file. The datum is encoded into the in-memory
  # buffer; the buffer is written out as a block once it reaches
  # SYNC_INTERVAL bytes.
  def <<(datum)
    datum_writer.write(datum, buffer_encoder)
    self.block_count += 1

    # if the data to write is larger than the sync interval, write
    # the block
    if buffer_writer.tell >= SYNC_INTERVAL
      write_block
    end
  end

  # Return the current position as a value that may be passed to
  # DataFileReader.seek(long). Forces the end of the current block,
  # emitting a synchronization marker.
  def sync
    write_block
    writer.tell
  end

  # Flush the current state of the file: writes any buffered datums as a
  # block, then flushes the underlying writer.
  def flush
    write_block
    writer.flush
  end

  # Flushes pending data and closes the underlying writer. The writer is
  # closed even when flushing raises, so the handle is not leaked; the flush
  # error still propagates. Returns the result of writer.close.
  def close
    flushed = false
    flush
    flushed = true
    writer.close
  ensure
    writer.close unless flushed
  end

  private

  # Writes the file header: magic bytes, metadata map, sync marker.
  def write_header
    # write magic
    writer.write(MAGIC)

    # write metadata
    datum_writer.write_data(META_SCHEMA, meta, encoder)

    # write sync marker
    writer.write(sync_marker)
  end

  # Writes the buffered datums as one block (item count, byte size of the
  # codec-compressed payload, payload, sync marker) and resets the buffer.
  # Does nothing when no datum has been buffered.
  #
  # TODO(jmhodges): make a schema for blocks and use datum_writer
  # TODO(jmhodges): do we really need the number of items in the block?
  def write_block
    if block_count > 0
      # write number of items in block and block size in bytes
      encoder.write_long(block_count)
      to_write = codec.compress(buffer_writer.string)
      encoder.write_long(to_write.respond_to?(:bytesize) ? to_write.bytesize : to_write.size)

      # write block contents
      writer.write(to_write)

      # write sync marker
      writer.write(sync_marker)

      # reset buffer
      buffer_writer.truncate(0)
      buffer_writer.rewind
      self.block_count = 0
    end
  end
end
|
200
|
+
|
201
|
+
# Read files written by DataFileWriter
|
202
|
+
# Reads datums back from an Avro data file: validates the header on
# construction, then decodes block after block while iterating.
class Reader
  include ::Enumerable

  # The reader and binary decoder for the raw file stream
  attr_reader :reader, :decoder

  # The binary decoder for the contents of a block (after codec decompression)
  attr_reader :block_decoder

  attr_reader :datum_reader, :sync_marker, :meta, :file_length, :codec
  attr_accessor :block_count # records remaining in current block

  # reader       - seekable IO-like object positioned anywhere in the file
  #                (the header is read after seeking to the start).
  # datum_reader - object used to decode the metadata and each datum; its
  #                writers_schema is set from the file's 'avro.schema' entry.
  #
  # Raises DataFileError when the magic bytes are missing or wrong.
  def initialize(reader, datum_reader)
    @reader = reader
    @decoder = IO::BinaryDecoder.new(reader)
    @datum_reader = datum_reader

    # read the header: magic, meta, sync
    read_header

    @codec = DataFile.get_codec(meta['avro.codec'])

    # get ready to read
    @block_count = 0
    datum_reader.writers_schema = Schema.parse(meta['avro.schema'])
  end

  # Iterates through each datum in this file, yielding it to the block.
  # Returns an Enumerator when called without a block.
  # TODO(jmhodges): handle block of length zero
  def each
    return enum_for(:each) unless block_given?

    loop do
      if block_count == 0
        break if eof?
        # A sync marker may precede the next block; after consuming it the
        # file may legitimately be exhausted.
        break if skip_sync && eof?
        read_block_header
      end

      datum = datum_reader.read(block_decoder)
      self.block_count -= 1
      yield(datum)
    end
  end

  # True when the underlying reader is at end of file.
  def eof?; reader.eof?; end

  # Closes the underlying reader.
  def close
    reader.close
  end

  private

  # Reads and validates the magic bytes, then loads the metadata map into
  # @meta and the sync marker into @sync_marker.
  def read_header
    # seek to the beginning of the file to get magic block
    reader.seek(0, 0)

    # check magic number; read returns nil on an empty file, which counts as
    # "shorter than the magic block"
    magic_in_file = reader.read(MAGIC_SIZE)
    if magic_in_file.nil? || magic_in_file.size < MAGIC_SIZE
      msg = 'Not an Avro data file: shorter than the Avro magic block'
      raise DataFileError, msg
    elsif magic_in_file != MAGIC
      msg = "Not an Avro data file: #{magic_in_file.inspect} doesn't match #{MAGIC.inspect}"
      raise DataFileError, msg
    end

    # read metadata
    @meta = datum_reader.read_data(META_SCHEMA,
                                   META_SCHEMA,
                                   decoder)
    # read sync marker
    @sync_marker = reader.read(SYNC_SIZE)
  end

  # Reads the item count and byte size of the next block, decompresses its
  # payload with the file's codec and points block_decoder at the result.
  def read_block_header
    self.block_count = decoder.read_long
    block_bytes = decoder.read_long
    data = codec.decompress(reader.read(block_bytes))
    @block_decoder = IO::BinaryDecoder.new(StringIO.new(data))
  end

  # read the length of the sync marker; if it matches the sync
  # marker, return true. Otherwise, seek back to where we started
  # and return false
  def skip_sync
    proposed_sync_marker = reader.read(SYNC_SIZE)
    return true if proposed_sync_marker == sync_marker

    # Rewind by what was actually consumed: a short read near the end of the
    # file returns fewer than SYNC_SIZE bytes (or nil).
    consumed = proposed_sync_marker ? proposed_sync_marker.bytesize : 0
    reader.seek(-consumed, 1)
    false
  end
end
|
299
|
+
|
300
|
+
|
301
|
+
# Codec named 'null': block data is stored exactly as given.
class NullCodec
  def codec_name
    'null'
  end

  # Returns +data+ unchanged.
  def decompress(data)
    data
  end

  # Returns +data+ unchanged.
  def compress(data)
    data
  end
end
|
306
|
+
|
307
|
+
# Codec named 'deflate': compresses block data with zlib in raw deflate mode.
class DeflateCodec
  attr_reader :level

  # level - zlib compression level passed to Zlib::Deflate.new.
  def initialize(level=Zlib::DEFAULT_COMPRESSION)
    @level = level
  end

  def codec_name; 'deflate'; end

  # Inflates +compressed+ and returns the original bytes.
  def decompress(compressed)
    # Passing a negative number to Inflate puts it into "raw" RFC1951 mode
    # (without the RFC1950 header & checksum). See the docs for
    # inflateInit2 in http://www.zlib.net/manual.html
    zstream = Zlib::Inflate.new(-Zlib::MAX_WBITS)
    data = zstream.inflate(compressed)
    data << zstream.finish
  ensure
    # zstream is nil when the constructor itself raised; guarding keeps the
    # original error from being masked by a NoMethodError.
    zstream.close if zstream
  end

  # Deflates +data+ at the configured level and returns the compressed bytes.
  def compress(data)
    zstream = Zlib::Deflate.new(level, -Zlib::MAX_WBITS)
    compressed = zstream.deflate(data)
    compressed << zstream.finish
  ensure
    zstream.close if zstream
  end
end
|
335
|
+
|
336
|
+
# Codec named 'snappy', backed by the Snappy constant from the `snappy`
# library, which is required lazily on first use.
class SnappyCodec
  def codec_name
    'snappy'
  end

  # Returns Snappy.inflate(data).
  def decompress(data)
    load_snappy!
    Snappy.inflate(data)
  end

  # Returns Snappy.deflate(data).
  def compress(data)
    load_snappy!
    Snappy.deflate(data)
  end

  private

  # Requires 'snappy' unless Snappy is already defined; a failed require is
  # re-raised as LoadError with an installation hint.
  def load_snappy!
    return if defined?(Snappy)

    begin
      require 'snappy'
    rescue LoadError
      raise LoadError, "Snappy compression is not available, please install the `snappy` gem."
    end
  end
end
|
357
|
+
|
358
|
+
DataFile.register_codec NullCodec
|
359
|
+
DataFile.register_codec DeflateCodec
|
360
|
+
DataFile.register_codec SnappyCodec
|
361
|
+
|
362
|
+
# TODO this constant won't be updated if you register another codec.
|
363
|
+
# Deprecated in favor of Avro::DataFile::codecs
|
364
|
+
VALID_CODECS = DataFile.codecs.keys
|
365
|
+
end
|
366
|
+
end
|
data/lib/avro/io.rb
ADDED
@@ -0,0 +1,619 @@
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
3
|
+
# distributed with this work for additional information
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
6
|
+
# "License"); you may not use this file except in compliance
|
7
|
+
# with the License. You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
# See the License for the specific language governing permissions and
|
15
|
+
# limitations under the License.
|
16
|
+
|
17
|
+
module Avro
|
18
|
+
module IO
|
19
|
+
# Raised when datum is not an example of schema
|
20
|
+
class AvroTypeError < AvroError
  # expected_schema - schema the datum was checked against.
  # datum           - offending value; its #inspect form goes in the message.
  def initialize(expected_schema, datum)
    message = "The datum #{datum.inspect} is not an example of schema #{expected_schema}"
    super(message)
  end
end
|
25
|
+
|
26
|
+
# Raised when writer's and reader's schema do not match
|
27
|
+
class SchemaMatchException < AvroError
  # Builds the message from the string forms of both schemas.
  def initialize(writers_schema, readers_schema)
    message = "Writer's schema #{writers_schema} and Reader's schema #{readers_schema} do not match."
    super(message)
  end
end
|
33
|
+
|
34
|
+
# FIXME(jmhodges) move validate to this module?
|
35
|
+
|
36
|
+
# Reads leaf (primitive) values in Avro binary encoding from a stream.
class BinaryDecoder
  # reader is an object on which we can call read, seek and tell.
  attr_reader :reader

  def initialize(reader)
    @reader = reader
  end

  # Reads one byte and returns it as an unsigned integer.
  def byte!
    @reader.read(1).unpack('C')[0]
  end

  # null is written as zero bytes, so nothing is consumed.
  def read_null
    nil
  end

  # A boolean is a single byte; true only when that byte equals 1.
  def read_boolean
    byte! == 1
  end

  def read_int
    read_long
  end

  # int and long values are written using variable-length, zig-zag coding:
  # 7 payload bits per byte, least significant group first, high bit set on
  # every byte except the last.
  def read_long
    current = byte!
    accumulated = current & 0x7F
    offset = 7
    until (current & 0x80) == 0
      current = byte!
      accumulated |= (current & 0x7F) << offset
      offset += 7
    end
    # undo the zig-zag mapping
    (accumulated >> 1) ^ -(accumulated & 1)
  end

  # A float is written as 4 bytes.
  # The float is converted into a 32-bit integer using a method
  # equivalent to Java's floatToIntBits and then encoded in
  # little-endian format.
  def read_float
    @reader.read(4).unpack('e').first
  end

  # A double is written as 8 bytes.
  # The double is converted into a 64-bit integer using a method
  # equivalent to Java's doubleToLongBits and then encoded in
  # little-endian format.
  def read_double
    @reader.read(8).unpack('E').first
  end

  # Bytes are encoded as a long followed by that many bytes of data.
  def read_bytes
    length = read_long
    read(length)
  end

  # A string is encoded as a long followed by that many bytes of
  # UTF-8 encoded character data.
  def read_string
    raw = read_bytes
    raw.force_encoding("UTF-8") if raw.respond_to?(:force_encoding)
    raw
  end

  # Reads +len+ bytes from the underlying reader.
  def read(len)
    @reader.read(len)
  end

  def skip_null
    nil
  end

  def skip_boolean
    skip(1)
  end

  def skip_int
    skip_long
  end

  # Consumes bytes up to and including the first one without the high bit.
  def skip_long
    loop do
      break if (byte! & 0x80) == 0
    end
  end

  def skip_float
    skip(4)
  end

  def skip_double
    skip(8)
  end

  # Reads the length prefix, then seeks past that many bytes.
  def skip_bytes
    length = read_long
    skip(length)
  end

  def skip_string
    skip_bytes
  end

  # Moves the reader +n+ bytes forward from its current position.
  def skip(n)
    position = reader.tell
    reader.seek(position + n)
  end
end
|
148
|
+
|
149
|
+
# Write leaf values
|
150
|
+
# Writes leaf (primitive) values in Avro binary encoding to +writer+, an
# object that responds to write.
class BinaryEncoder
  attr_reader :writer

  def initialize(writer)
    @writer = writer
  end

  # null is written as zero bytes
  def write_null(datum)
    nil
  end

  # a boolean is written as a single byte
  # whose value is either 0 (false) or 1 (true).
  def write_boolean(datum)
    on_disk = datum ? 1.chr : 0.chr
    writer.write(on_disk)
  end

  # int and long values are written using variable-length,
  # zig-zag coding.
  def write_int(n)
    write_long(n)
  end

  # int and long values are written using variable-length,
  # zig-zag coding.
  def write_long(n)
    # zig-zag: interleave negative and positive values so that numbers of
    # small magnitude use few bytes
    n = (n << 1) ^ (n >> 63)
    # emit 7 bits per byte, least significant group first; the high bit
    # marks "more bytes follow"
    while (n & ~0x7F) != 0
      @writer.write(((n & 0x7f) | 0x80).chr)
      n >>= 7
    end
    @writer.write(n.chr)
  end

  # A float is written as 4 bytes.
  # The float is converted into a 32-bit integer using a method
  # equivalent to Java's floatToIntBits and then encoded in
  # little-endian format.
  def write_float(datum)
    @writer.write([datum].pack('e'))
  end

  # A double is written as 8 bytes.
  # The double is converted into a 64-bit integer using a method
  # equivalent to Java's doubleToLongBits and then encoded in
  # little-endian format.
  def write_double(datum)
    @writer.write([datum].pack('E'))
  end

  # Bytes are encoded as a long followed by that many bytes of data.
  def write_bytes(datum)
    write_long(datum.bytesize)
    @writer.write(datum)
  end

  # A string is encoded as a long followed by that many bytes of
  # UTF-8 encoded character data
  def write_string(datum)
    datum = datum.encode('utf-8') if datum.respond_to? :encode
    write_bytes(datum)
  end

  # Write an arbitrary datum: passes it to the writer without any encoding.
  def write(datum)
    writer.write(datum)
  end
end
|
221
|
+
|
222
|
+
# Decodes datums written with a writer's schema, resolving them against a
# reader's schema (type promotion, field defaults, skipping unknown fields).
class DatumReader
  # Returns true when data written with +writers_schema+ may be read with
  # +readers_schema+: either side is a union, the types are equal (with
  # matching full names / sizes / element types where applicable), or the
  # writer's numeric type can be promoted to the reader's.
  def self.match_schemas(writers_schema, readers_schema)
    w_type = writers_schema.type_sym
    r_type = readers_schema.type_sym

    # This conditional is begging for some OO love.
    if w_type == :union || r_type == :union
      return true
    end

    if w_type == r_type
      return true if Schema::PRIMITIVE_TYPES_SYM.include?(r_type)

      case r_type
      when :record, :error, :enum
        return writers_schema.fullname == readers_schema.fullname
      when :request
        return true
      when :fixed
        return writers_schema.fullname == readers_schema.fullname &&
               writers_schema.size == readers_schema.size
      when :map
        return writers_schema.values.type == readers_schema.values.type
      when :array
        return writers_schema.items.type == readers_schema.items.type
      end
    end

    # Handle schema promotion
    if w_type == :int && [:long, :float, :double].include?(r_type)
      return true
    elsif w_type == :long && [:float, :double].include?(r_type)
      return true
    elsif w_type == :float && r_type == :double
      return true
    end

    false
  end

  attr_accessor :writers_schema, :readers_schema

  def initialize(writers_schema=nil, readers_schema=nil)
    @writers_schema = writers_schema
    @readers_schema = readers_schema
  end

  # Reads one datum from +decoder+. When no reader's schema has been set, the
  # writer's schema is used for it (and remembered).
  def read(decoder)
    self.readers_schema = writers_schema unless readers_schema
    read_data(writers_schema, readers_schema, decoder)
  end

  # Reads one value of +writers_schema+ from +decoder+ and returns it after
  # passing it through readers_schema.type_adapter.decode.
  #
  # Raises SchemaMatchException when the schemas cannot be matched and
  # AvroError for an unknown writer's type.
  def read_data(writers_schema, readers_schema, decoder)
    # schema matching
    unless self.class.match_schemas(writers_schema, readers_schema)
      raise SchemaMatchException.new(writers_schema, readers_schema)
    end

    # schema resolution: reader's schema is a union, writer's
    # schema is not
    if writers_schema.type_sym != :union && readers_schema.type_sym == :union
      rs = readers_schema.schemas.find { |s|
        self.class.match_schemas(writers_schema, s)
      }
      return read_data(writers_schema, rs, decoder) if rs
      raise SchemaMatchException.new(writers_schema, readers_schema)
    end

    # function dispatch for reading data based on type of writer's
    # schema
    datum = case writers_schema.type_sym
            when :null;    decoder.read_null
            when :boolean; decoder.read_boolean
            when :string;  decoder.read_string
            when :int;     decoder.read_int
            when :long;    decoder.read_long
            when :float;   decoder.read_float
            when :double;  decoder.read_double
            when :bytes;   decoder.read_bytes
            when :fixed;   read_fixed(writers_schema, readers_schema, decoder)
            when :enum;    read_enum(writers_schema, readers_schema, decoder)
            when :array;   read_array(writers_schema, readers_schema, decoder)
            when :map;     read_map(writers_schema, readers_schema, decoder)
            when :union;   read_union(writers_schema, readers_schema, decoder)
            when :record, :error, :request; read_record(writers_schema, readers_schema, decoder)
            else
              raise AvroError, "Cannot read unknown schema type: #{writers_schema.type}"
            end

    readers_schema.type_adapter.decode(datum)
  end

  # A fixed value is exactly writers_schema.size raw bytes.
  def read_fixed(writers_schema, readers_schema, decoder)
    decoder.read(writers_schema.size)
  end

  # An enum is stored as the index of the symbol in the writer's symbol list.
  def read_enum(writers_schema, readers_schema, decoder)
    index_of_symbol = decoder.read_int

    # TODO(jmhodges): figure out what unset means for resolution when the
    # reader's schema does not contain the symbol; for now the writer's
    # symbol is returned regardless.
    writers_schema.symbols[index_of_symbol]
  end

  # Arrays are stored as a series of blocks: an item count, the items, and
  # a final count of zero. A negative count is followed by the block's byte
  # size, which is only useful for skipping.
  def read_array(writers_schema, readers_schema, decoder)
    read_items = []
    block_count = decoder.read_long
    while block_count != 0
      if block_count < 0
        block_count = -block_count
        decoder.read_long # block size in bytes; must be consumed, not needed
      end
      block_count.times do
        read_items << read_data(writers_schema.items,
                                readers_schema.items,
                                decoder)
      end
      block_count = decoder.read_long
    end

    read_items
  end

  # Maps use the same block layout as arrays; each entry is a string key
  # followed by the value.
  def read_map(writers_schema, readers_schema, decoder)
    read_items = {}
    block_count = decoder.read_long
    while block_count != 0
      if block_count < 0
        block_count = -block_count
        decoder.read_long # block size in bytes; must be consumed, not needed
      end
      block_count.times do
        key = decoder.read_string
        read_items[key] = read_data(writers_schema.values,
                                    readers_schema.values,
                                    decoder)
      end
      block_count = decoder.read_long
    end

    read_items
  end

  # A union is stored as the index of the chosen branch followed by the
  # value encoded with that branch's schema.
  def read_union(writers_schema, readers_schema, decoder)
    index_of_schema = decoder.read_long
    selected_writers_schema = writers_schema.schemas[index_of_schema]

    read_data(selected_writers_schema, readers_schema, decoder)
  end

  # Reads the writer's fields in order; fields unknown to the reader are
  # skipped, and reader fields absent from the writer get their default.
  def read_record(writers_schema, readers_schema, decoder)
    readers_fields_hash = readers_schema.fields_hash
    read_record = {}
    writers_schema.fields.each do |field|
      readers_field = readers_fields_hash[field.name]
      if readers_field
        field_val = read_data(field.type, readers_field.type, decoder)
        read_record[field.name] = field_val
      else
        skip_data(field.type, decoder)
      end
    end

    # fill in the default values
    if readers_fields_hash.size > read_record.size
      writers_fields_hash = writers_schema.fields_hash
      readers_fields_hash.each do |field_name, field|
        next if writers_fields_hash.has_key?(field_name)

        # FIXME(jmhodges) a field without a default stays 'unset'
        unless field.default.nil?
          read_record[field.name] = read_default_value(field.type, field.default)
        end
      end
    end

    read_record
  end

  # Converts a default value taken from a schema (JSON-style data) into the
  # value a datum of +field_schema+ would have.
  #
  # Raises AvroError for an unknown schema type.
  def read_default_value(field_schema, default_value)
    # Basically a JSON Decoder?
    case field_schema.type_sym
    when :null
      return nil
    when :boolean
      return default_value
    when :int, :long
      return Integer(default_value)
    when :float, :double
      return Float(default_value)
    when :enum, :fixed, :string, :bytes
      return default_value
    when :array
      return default_value.map do |json_val|
        read_default_value(field_schema.items, json_val)
      end
    when :map
      read_map = {}
      default_value.each do |key, json_val|
        read_map[key] = read_default_value(field_schema.values, json_val)
      end
      return read_map
    when :union
      return read_default_value(field_schema.schemas[0], default_value)
    when :record, :error
      read_record = {}
      field_schema.fields.each do |field|
        json_val = default_value[field.name]
        # Fall back to the field's own default only when the value is
        # missing; an explicit false must be kept.
        json_val = field.default if json_val.nil?
        read_record[field.name] = read_default_value(field.type, json_val)
      end
      return read_record
    else
      fail_msg = "Unknown type: #{field_schema.type}"
      raise AvroError, fail_msg
    end
  end

  # Advances +decoder+ past one value of +writers_schema+ without building it.
  #
  # Raises AvroError for an unknown schema type.
  def skip_data(writers_schema, decoder)
    case writers_schema.type_sym
    when :null
      decoder.skip_null
    when :boolean
      decoder.skip_boolean
    when :string
      decoder.skip_string
    when :int
      decoder.skip_int
    when :long
      decoder.skip_long
    when :float
      decoder.skip_float
    when :double
      decoder.skip_double
    when :bytes
      decoder.skip_bytes
    when :fixed
      skip_fixed(writers_schema, decoder)
    when :enum
      skip_enum(writers_schema, decoder)
    when :array
      skip_array(writers_schema, decoder)
    when :map
      skip_map(writers_schema, decoder)
    when :union
      skip_union(writers_schema, decoder)
    when :record, :error, :request
      skip_record(writers_schema, decoder)
    else
      raise AvroError, "Unknown schema type: #{writers_schema.type}"
    end
  end

  def skip_fixed(writers_schema, decoder)
    decoder.skip(writers_schema.size)
  end

  def skip_enum(writers_schema, decoder)
    decoder.skip_int
  end

  def skip_union(writers_schema, decoder)
    index = decoder.read_long
    skip_data(writers_schema.schemas[index], decoder)
  end

  def skip_array(writers_schema, decoder)
    skip_blocks(decoder) { skip_data(writers_schema.items, decoder) }
  end

  def skip_map(writers_schema, decoder)
    skip_blocks(decoder) {
      decoder.skip_string
      skip_data(writers_schema.values, decoder)
    }
  end

  def skip_record(writers_schema, decoder)
    writers_schema.fields.each { |f| skip_data(f.type, decoder) }
  end

  private

  # Walks the blocks of an array/map. A negative count is followed by the
  # block's byte size, so the whole block is skipped in one seek; otherwise
  # the given block is called once per item.
  def skip_blocks(decoder, &blk)
    block_count = decoder.read_long
    while block_count != 0
      if block_count < 0
        decoder.skip(decoder.read_long)
      else
        block_count.times(&blk)
      end
      block_count = decoder.read_long
    end
  end
end # DatumReader
|
531
|
+
|
532
|
+
# DatumWriter for generic ruby objects
|
533
|
+
# DatumWriter for generic ruby objects: validates a datum against the
# writer's schema and encodes it through a binary encoder.
class DatumWriter
  attr_accessor :writers_schema

  def initialize(writers_schema=nil)
    @writers_schema = writers_schema
  end

  # Encodes +datum+ with the configured writers_schema.
  def write(datum, encoder)
    write_data(writers_schema, datum, encoder)
  end

  # Encodes +logical_datum+ according to +writers_schema+.
  #
  # The datum is first passed through writers_schema.type_adapter.encode and
  # then checked with Schema.validate.
  #
  # Raises AvroTypeError when validation fails and AvroError for an unknown
  # schema type.
  def write_data(writers_schema, logical_datum, encoder)
    datum = writers_schema.type_adapter.encode(logical_datum)

    unless Schema.validate(writers_schema, datum)
      raise AvroTypeError.new(writers_schema, datum)
    end

    # function dispatch to write datum
    case writers_schema.type_sym
    when :null;    encoder.write_null(datum)
    when :boolean; encoder.write_boolean(datum)
    when :string;  encoder.write_string(datum)
    when :int;     encoder.write_int(datum)
    when :long;    encoder.write_long(datum)
    when :float;   encoder.write_float(datum)
    when :double;  encoder.write_double(datum)
    when :bytes;   encoder.write_bytes(datum)
    when :fixed;   write_fixed(writers_schema, datum, encoder)
    when :enum;    write_enum(writers_schema, datum, encoder)
    when :array;   write_array(writers_schema, datum, encoder)
    when :map;     write_map(writers_schema, datum, encoder)
    when :union;   write_union(writers_schema, datum, encoder)
    when :record, :error, :request; write_record(writers_schema, datum, encoder)
    else
      raise AvroError.new("Unknown type: #{writers_schema.type}")
    end
  end

  # A fixed value is written as its raw bytes, with no length prefix.
  def write_fixed(writers_schema, datum, encoder)
    encoder.write(datum)
  end

  # An enum is written as the index of the symbol in the schema's list.
  def write_enum(writers_schema, datum, encoder)
    index_of_datum = writers_schema.symbols.index(datum)
    encoder.write_int(index_of_datum)
  end

  # Non-empty arrays are written as one block (item count, then the items);
  # a count of zero always terminates the array.
  def write_array(writers_schema, datum, encoder)
    if datum.size > 0
      encoder.write_long(datum.size)
      datum.each do |item|
        write_data(writers_schema.items, item, encoder)
      end
    end
    encoder.write_long(0)
  end

  # Same layout as arrays; each entry is a string key followed by the value.
  def write_map(writers_schema, datum, encoder)
    if datum.size > 0
      encoder.write_long(datum.size)
      datum.each do |k, v|
        encoder.write_string(k)
        write_data(writers_schema.values, v, encoder)
      end
    end
    encoder.write_long(0)
  end

  # Writes the index of the first union branch that validates +datum+,
  # followed by the datum encoded with that branch.
  #
  # Raises AvroTypeError when no branch accepts the datum.
  def write_union(writers_schema, datum, encoder)
    index_of_schema = writers_schema.schemas.find_index do |branch|
      Schema.validate(branch, datum)
    end
    raise AvroTypeError.new(writers_schema, datum) unless index_of_schema

    encoder.write_long(index_of_schema)
    write_data(writers_schema.schemas[index_of_schema], datum, encoder)
  end

  # Writes each field of the schema in order, looking the value up in
  # +datum+ by field name.
  def write_record(writers_schema, datum, encoder)
    writers_schema.fields.each do |field|
      write_data(field.type, datum[field.name], encoder)
    end
  end
end # DatumWriter
|
618
|
+
end
|
619
|
+
end
|