avro-salsify-fork 1.9.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,366 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
require 'openssl'
require 'stringio'
require 'zlib'
18
+
19
+ module Avro
20
+ module DataFile
21
+ VERSION = 1
22
+ MAGIC = "Obj" + [VERSION].pack('c')
23
+ MAGIC.force_encoding('BINARY') if MAGIC.respond_to?(:force_encoding)
24
+ MAGIC_SIZE = MAGIC.respond_to?(:bytesize) ? MAGIC.bytesize : MAGIC.size
25
+ SYNC_SIZE = 16
26
+ SYNC_INTERVAL = 4000 * SYNC_SIZE
27
+ META_SCHEMA = Schema.parse('{"type": "map", "values": "bytes"}')
28
+ VALID_ENCODINGS = ['binary'] # not used yet
29
+
30
+ class DataFileError < AvroError; end
31
+
32
# Open an Avro data file at +file_path+.
#
# mode:: 'r' to read, 'w' to write (writing requires +schema+).
# schema:: optional schema JSON/string, parsed via Avro::Schema.parse.
# codec:: optional codec (name, class, or instance) used when writing.
#
# With a block, yields the Reader/Writer and closes it afterwards;
# without a block, returns the open Reader/Writer.
def self.open(file_path, mode='r', schema=nil, codec=nil)
  schema = Avro::Schema.parse(schema) if schema
  io =
    case mode
    when 'w'
      raise DataFileError, "Writing an Avro file requires a schema." unless schema
      open_writer(File.open(file_path, 'wb'), schema, codec)
    when 'r'
      open_reader(File.open(file_path, 'rb'), schema)
    else
      raise DataFileError, "Only modes 'r' and 'w' allowed. You gave #{mode.inspect}."
    end

  yield io if block_given?
  io
ensure
  io.close if block_given? && io
end
51
+
52
# Hash of registered codec instances keyed by codec name string;
# nil until the first register_codec call.
def self.codecs
  @codecs
end
55
+
56
# Register +codec+ (an instance, or a class to be instantiated) under
# its codec_name, replacing any codec previously registered under
# that name. Returns the stored codec instance.
def self.register_codec(codec)
  @codecs ||= {}
  if codec.is_a?(Class) && !codec.respond_to?(:codec_name)
    codec = codec.new
  end
  @codecs[codec.codec_name.to_s] = codec
end
61
+
62
# Resolve +codec+ to a codec instance. Accepts an instance (anything
# responding to compress and decompress), a codec class, or a
# registered codec name (string or symbol); nil means the 'null' codec.
# Raises DataFileError for anything unrecognized.
def self.get_codec(codec)
  codec ||= 'null'
  # Already a usable codec instance?
  return codec if codec.respond_to?(:compress) && codec.respond_to?(:decompress)
  # A codec class to instantiate?
  return codec.new if codec.is_a?(Class)
  # A registered codec name?
  key = codec.to_s
  return @codecs[key] if @codecs.include?(key)
  raise DataFileError, "Unknown codec: #{codec.inspect}"
end
74
+
75
class << self
  private

  # Wrap an already-open file in a Writer backed by a DatumWriter
  # for +schema+; +codec+ is passed through to Writer.
  def open_writer(file, schema, codec=nil)
    writer = Avro::IO::DatumWriter.new(schema)
    Avro::DataFile::Writer.new(file, writer, schema, codec)
  end

  # Wrap an already-open file in a Reader; +schema+ (may be nil)
  # becomes the reader's schema for resolution.
  def open_reader(file, schema)
    reader = Avro::IO::DatumReader.new(nil, schema)
    Avro::DataFile::Reader.new(file, reader)
  end
end
87
+
88
# Writes datums into an Avro object container file: a header (magic,
# metadata map, sync marker) followed by data blocks, each terminated
# by the 16-byte sync marker.
class Writer
  # Fresh random 16-byte sync marker for a new file.
  def self.generate_sync_marker
    OpenSSL::Random.random_bytes(16)
  end

  attr_reader :writer, :encoder, :datum_writer, :buffer_writer, :buffer_encoder, :sync_marker, :meta, :codec
  attr_accessor :block_count

  def initialize(writer, datum_writer, writers_schema=nil, codec=nil, meta={})
    # If writers_schema is not present, presume we're appending
    @writer = writer
    @encoder = IO::BinaryEncoder.new(@writer)
    @datum_writer = datum_writer
    @meta = meta
    # Datums accumulate here until a whole block is flushed to @writer.
    @buffer_writer = StringIO.new('', 'w')
    @buffer_writer.set_encoding('BINARY') if @buffer_writer.respond_to?(:set_encoding)
    @buffer_encoder = IO::BinaryEncoder.new(@buffer_writer)
    @block_count = 0

    if writers_schema
      # New file: record codec + schema in the metadata and emit the header.
      @sync_marker = Writer.generate_sync_marker
      @codec = DataFile.get_codec(codec)
      @meta['avro.codec'] = @codec.codec_name.to_s
      @meta['avro.schema'] = writers_schema.to_s
      datum_writer.writers_schema = writers_schema
      write_header
    else
      # open writer for reading to collect metadata
      dfr = Reader.new(writer, Avro::IO::DatumReader.new)

      # FIXME(jmhodges): collect arbitrary metadata
      # collect metadata
      @sync_marker = dfr.sync_marker
      @meta['avro.codec'] = dfr.meta['avro.codec']
      @codec = DataFile.get_codec(meta['avro.codec'])

      # get schema used to write existing file
      schema_from_file = dfr.meta['avro.schema']
      @meta['avro.schema'] = schema_from_file
      datum_writer.writers_schema = Schema.parse(schema_from_file)

      # seek to the end of the file and prepare for writing
      writer.seek(0,2)
    end
  end

  # Append a datum to the file
  def <<(datum)
    datum_writer.write(datum, buffer_encoder)
    self.block_count += 1

    # if the data to write is larger than the sync interval, write
    # the block
    if buffer_writer.tell >= SYNC_INTERVAL
      write_block
    end
  end

  # Return the current position as a value that may be passed to
  # DataFileReader.seek(long). Forces the end of the current block,
  # emitting a synchronization marker.
  def sync
    write_block
    writer.tell
  end

  # Flush the current state of the file, including metadata
  def flush
    write_block
    writer.flush
  end

  # Flush any buffered block and close the underlying stream.
  def close
    flush
    writer.close
  end

  private

  # Emit the container header: magic, metadata map, sync marker.
  def write_header
    # write magic
    writer.write(MAGIC)

    # write metadata
    datum_writer.write_data(META_SCHEMA, meta, encoder)

    # write sync marker
    writer.write(sync_marker)
  end

  # TODO(jmhodges): make a schema for blocks and use datum_writer
  # TODO(jmhodges): do we really need the number of items in the block?
  def write_block
    if block_count > 0
      # write number of items in block and block size in bytes
      encoder.write_long(block_count)
      to_write = codec.compress(buffer_writer.string)
      encoder.write_long(to_write.respond_to?(:bytesize) ? to_write.bytesize : to_write.size)

      # write block contents
      writer.write(to_write)

      # write sync marker
      writer.write(sync_marker)

      # reset buffer
      buffer_writer.truncate(0)
      buffer_writer.rewind
      self.block_count = 0
    end
  end
end
200
+
201
# Read files written by DataFileWriter
class Reader
  include ::Enumerable

  # The reader and binary decoder for the raw file stream
  attr_reader :reader, :decoder

  # The binary decoder for the contents of a block (after codec decompression)
  attr_reader :block_decoder

  attr_reader :datum_reader, :sync_marker, :meta, :file_length, :codec
  attr_accessor :block_count # records remaining in current block

  def initialize(reader, datum_reader)
    @reader = reader
    @decoder = IO::BinaryDecoder.new(reader)
    @datum_reader = datum_reader

    # read the header: magic, meta, sync
    read_header

    # Codec name comes from the file metadata; nil maps to 'null'.
    @codec = DataFile.get_codec(meta['avro.codec'])

    # get ready to read
    @block_count = 0
    datum_reader.writers_schema = Schema.parse meta['avro.schema']
  end

  # Iterates through each datum in this file
  # TODO(jmhodges): handle block of length zero
  def each
    loop do
      # When the current block is exhausted, consume the sync marker
      # (if present) and load the next block's header.
      if block_count == 0
        case
        when eof?; break
        when skip_sync
          break if eof?
          read_block_header
        else
          read_block_header
        end
      end

      datum = datum_reader.read(block_decoder)
      self.block_count -= 1
      yield(datum)
    end
  end

  # True when the underlying stream is exhausted.
  def eof?; reader.eof?; end

  # Close the underlying stream.
  def close
    reader.close
  end

  private
  # Validate the magic bytes, then read the metadata map and the
  # file's sync marker.
  def read_header
    # seek to the beginning of the file to get magic block
    reader.seek(0, 0)

    # check magic number
    magic_in_file = reader.read(MAGIC_SIZE)
    if magic_in_file.size < MAGIC_SIZE
      msg = 'Not an Avro data file: shorter than the Avro magic block'
      raise DataFileError, msg
    elsif magic_in_file != MAGIC
      msg = "Not an Avro data file: #{magic_in_file.inspect} doesn't match #{MAGIC.inspect}"
      raise DataFileError, msg
    end

    # read metadata
    @meta = datum_reader.read_data(META_SCHEMA,
                                   META_SCHEMA,
                                   decoder)
    # read sync marker
    @sync_marker = reader.read(SYNC_SIZE)
  end

  # Read a block's record count and byte size, decompress its
  # contents, and point block_decoder at the decompressed bytes.
  def read_block_header
    self.block_count = decoder.read_long
    block_bytes = decoder.read_long
    data = codec.decompress(reader.read(block_bytes))
    @block_decoder = IO::BinaryDecoder.new(StringIO.new(data))
  end

  # read the length of the sync marker; if it matches the sync
  # marker, return true. Otherwise, seek back to where we started
  # and return false
  def skip_sync
    proposed_sync_marker = reader.read(SYNC_SIZE)
    if proposed_sync_marker != sync_marker
      reader.seek(-SYNC_SIZE, 1)
      false
    else
      true
    end
  end
end
299
+
300
+
301
# Identity codec: block data is stored uncompressed.
class NullCodec
  def codec_name
    'null'
  end

  def compress(data)
    data
  end

  def decompress(data)
    data
  end
end
306
+
307
# Codec that stores block data as raw (headerless) deflate, per the
# Avro spec's "deflate" codec.
class DeflateCodec
  # Zlib compression level used by compress.
  attr_reader :level

  def initialize(level=Zlib::DEFAULT_COMPRESSION)
    @level = level
  end

  def codec_name; 'deflate'; end

  def decompress(compressed)
    # Passing a negative number to Inflate puts it into "raw" RFC1951 mode
    # (without the RFC1950 header & checksum). See the docs for
    # inflateInit2 in http://www.zlib.net/manual.html
    zstream = Zlib::Inflate.new(-Zlib::MAX_WBITS)
    data = zstream.inflate(compressed)
    data << zstream.finish
  ensure
    # Guard against Inflate.new itself having raised, in which case
    # zstream is nil and the original code raised NoMethodError from
    # the ensure block, masking the real error.
    zstream.close if zstream
  end

  def compress(data)
    zstream = Zlib::Deflate.new(level, -Zlib::MAX_WBITS)
    compressed = zstream.deflate(data)
    compressed << zstream.finish
  ensure
    # Same nil guard as decompress.
    zstream.close if zstream
  end
end
335
+
336
# Codec backed by the optional `snappy` gem. The gem is loaded lazily,
# so it is only required when snappy data is actually read or written.
class SnappyCodec
  def codec_name
    'snappy'
  end

  def compress(data)
    load_snappy!
    Snappy.deflate(data)
  end

  def decompress(data)
    load_snappy!
    Snappy.inflate(data)
  end

  private

  # Require the snappy gem on first use; re-raise a friendlier
  # LoadError when it is not installed.
  def load_snappy!
    require 'snappy' unless defined?(Snappy)
  rescue LoadError
    raise LoadError, "Snappy compression is not available, please install the `snappy` gem."
  end
end
357
+
358
# Register the built-in codecs under their canonical names.
DataFile.register_codec NullCodec
DataFile.register_codec DeflateCodec
DataFile.register_codec SnappyCodec

# TODO this constant won't be updated if you register another codec.
# Deprecated in favor of Avro::DataFile::codecs
VALID_CODECS = DataFile.codecs.keys
365
+ end
366
+ end
@@ -0,0 +1,619 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ module Avro
18
+ module IO
19
# Raised when a datum is not a valid example of the expected schema.
class AvroTypeError < AvroError
  def initialize(expected_schema, datum)
    message = "The datum #{datum.inspect} is not an example of schema #{expected_schema}"
    super(message)
  end
end
25
+
26
# Raised when the writer's and reader's schemas cannot be resolved.
class SchemaMatchException < AvroError
  def initialize(writers_schema, readers_schema)
    message = "Writer's schema #{writers_schema} and Reader's schema " +
              "#{readers_schema} do not match."
    super(message)
  end
end
33
+
34
+ # FIXME(jmhodges) move validate to this module?
35
+
36
# Decodes Avro leaf values from a binary stream.
#
# +reader+ may be any object that responds to read, seek and tell
# (File, StringIO, ...).
class BinaryDecoder
  attr_reader :reader

  def initialize(reader)
    @reader = reader
  end

  # Consume one byte and return its unsigned integer value.
  def byte!
    @reader.read(1).unpack('C').first
  end

  # null occupies zero bytes on the wire.
  def read_null
    nil
  end

  # A boolean is a single byte: 1 means true, anything else false.
  def read_boolean
    byte! == 1
  end

  # ints share the variable-length zig-zag wire format of longs.
  def read_int
    read_long
  end

  # Decode a variable-length, zig-zag coded integer: 7 payload bits
  # per byte, high bit set on every byte except the last.
  def read_long
    value = 0
    shift = 0
    loop do
      byte = byte!
      value |= (byte & 0x7F) << shift
      shift += 7
      break if (byte & 0x80).zero?
    end
    # Undo the zig-zag mapping to recover the sign.
    (value >> 1) ^ -(value & 1)
  end

  # A float is 4 bytes: little-endian IEEE-754 single precision
  # (equivalent to Java's floatToIntBits).
  def read_float
    read(4).unpack('e').first
  end

  # A double is 8 bytes: little-endian IEEE-754 double precision
  # (equivalent to Java's doubleToLongBits).
  def read_double
    read(8).unpack('E').first
  end

  # Bytes are a long length prefix followed by that many bytes.
  def read_bytes
    read(read_long)
  end

  # A string is bytes carrying UTF-8 encoded character data.
  def read_string
    string = read_bytes
    string.force_encoding('UTF-8') if string.respond_to?(:force_encoding)
    string
  end

  # Read +len+ raw bytes from the stream.
  def read(len)
    @reader.read(len)
  end

  # The skip_* methods advance past a value without materializing it.

  def skip_null
    nil
  end

  def skip_boolean
    skip(1)
  end

  def skip_int
    skip_long
  end

  def skip_long
    # Consume bytes until one without the continuation bit appears.
    b = byte!
    b = byte! until (b & 0x80).zero?
  end

  def skip_float
    skip(4)
  end

  def skip_double
    skip(8)
  end

  def skip_bytes
    skip(read_long)
  end

  def skip_string
    skip_bytes
  end

  # Seek forward +n+ bytes.
  def skip(n)
    reader.seek(reader.tell + n)
  end
end
148
+
149
# Write leaf values in Avro binary encoding.
#
# +writer+ may be any object that responds to write.
class BinaryEncoder
  attr_reader :writer

  def initialize(writer)
    @writer = writer
  end

  # null is written as zero bytes
  def write_null(datum)
    nil
  end

  # a boolean is written as a single byte
  # whose value is either 0 (false) or 1 (true).
  def write_boolean(datum)
    on_disk = datum ? 1.chr : 0.chr
    writer.write(on_disk)
  end

  # int and long values are written using variable-length,
  # zig-zag coding.
  def write_int(n)
    write_long(n)
  end

  # int and long values are written using variable-length,
  # zig-zag coding.
  def write_long(n)
    # NOTE: removed an unused leftover local (`foo = n`) from the
    # original implementation; it had no effect.
    #
    # Zig-zag: map signed values to unsigned so small magnitudes of
    # either sign encode in few bytes.
    n = (n << 1) ^ (n >> 63)
    # Emit 7 bits per byte, setting the continuation (high) bit on
    # every byte except the last.
    while (n & ~0x7F) != 0
      @writer.write(((n & 0x7f) | 0x80).chr)
      n >>= 7
    end
    @writer.write(n.chr)
  end

  # A float is written as 4 bytes.
  # The float is converted into a 32-bit integer using a method
  # equivalent to Java's floatToIntBits and then encoded in
  # little-endian format.
  def write_float(datum)
    @writer.write([datum].pack('e'))
  end

  # A double is written as 8 bytes.
  # The double is converted into a 64-bit integer using a method
  # equivalent to Java's doubleToLongBits and then encoded in
  # little-endian format.
  def write_double(datum)
    @writer.write([datum].pack('E'))
  end

  # Bytes are encoded as a long followed by that many bytes of data.
  def write_bytes(datum)
    write_long(datum.bytesize)
    @writer.write(datum)
  end

  # A string is encoded as a long followed by that many bytes of
  # UTF-8 encoded character data
  def write_string(datum)
    datum = datum.encode('utf-8') if datum.respond_to? :encode
    write_bytes(datum)
  end

  # Write an arbitrary datum as raw bytes (no length prefix).
  def write(datum)
    writer.write(datum)
  end
end
221
+
222
# Deserializes datums encoded with the writer's schema, resolving them
# against an optional reader's schema (Avro schema resolution).
class DatumReader
  # True when data written with writers_schema can be resolved into
  # readers_schema: identical or promotable primitive types, matching
  # fullnames (and sizes) for named types, or any union involvement.
  def self.match_schemas(writers_schema, readers_schema)
    w_type = writers_schema.type_sym
    r_type = readers_schema.type_sym

    # This conditional is begging for some OO love.
    if w_type == :union || r_type == :union
      return true
    end

    if w_type == r_type
      return true if Schema::PRIMITIVE_TYPES_SYM.include?(r_type)

      case r_type
      when :record
        return writers_schema.fullname == readers_schema.fullname
      when :error
        return writers_schema.fullname == readers_schema.fullname
      when :request
        return true
      when :fixed
        return writers_schema.fullname == readers_schema.fullname &&
               writers_schema.size == readers_schema.size
      when :enum
        return writers_schema.fullname == readers_schema.fullname
      when :map
        return writers_schema.values.type == readers_schema.values.type
      when :array
        return writers_schema.items.type == readers_schema.items.type
      end
    end

    # Handle schema promotion: int -> long/float/double,
    # long -> float/double, float -> double.
    if w_type == :int && [:long, :float, :double].include?(r_type)
      return true
    elsif w_type == :long && [:float, :double].include?(r_type)
      return true
    elsif w_type == :float && r_type == :double
      return true
    end

    return false
  end

  attr_accessor :writers_schema, :readers_schema

  def initialize(writers_schema=nil, readers_schema=nil)
    @writers_schema = writers_schema
    @readers_schema = readers_schema
  end

  # Read one datum; the reader's schema defaults to the writer's.
  def read(decoder)
    self.readers_schema = writers_schema unless readers_schema
    read_data(writers_schema, readers_schema, decoder)
  end

  # Core recursive read: verify the schemas match, resolve reader-side
  # unions, then dispatch on the writer's type.
  def read_data(writers_schema, readers_schema, decoder)
    # schema matching
    unless self.class.match_schemas(writers_schema, readers_schema)
      raise SchemaMatchException.new(writers_schema, readers_schema)
    end

    # schema resolution: reader's schema is a union, writer's
    # schema is not
    if writers_schema.type_sym != :union && readers_schema.type_sym == :union
      rs = readers_schema.schemas.find{|s|
        self.class.match_schemas(writers_schema, s)
      }
      return read_data(writers_schema, rs, decoder) if rs
      raise SchemaMatchException.new(writers_schema, readers_schema)
    end

    # function dispatch for reading data based on type of writer's
    # schema
    datum = case writers_schema.type_sym
    when :null;    decoder.read_null
    when :boolean; decoder.read_boolean
    when :string;  decoder.read_string
    when :int;     decoder.read_int
    when :long;    decoder.read_long
    when :float;   decoder.read_float
    when :double;  decoder.read_double
    when :bytes;   decoder.read_bytes
    when :fixed;   read_fixed(writers_schema, readers_schema, decoder)
    when :enum;    read_enum(writers_schema, readers_schema, decoder)
    when :array;   read_array(writers_schema, readers_schema, decoder)
    when :map;     read_map(writers_schema, readers_schema, decoder)
    when :union;   read_union(writers_schema, readers_schema, decoder)
    when :record, :error, :request; read_record(writers_schema, readers_schema, decoder)
    else
      raise AvroError, "Cannot read unknown schema type: #{writers_schema.type}"
    end

    # Apply any logical-type conversion declared on the reader's schema.
    readers_schema.type_adapter.decode(datum)
  end

  # fixed: exactly +size+ bytes, no length prefix.
  def read_fixed(writers_schema, readers_schema, decoder)
    decoder.read(writers_schema.size)
  end

  # enum: an int index into the writer's symbol list.
  def read_enum(writers_schema, readers_schema, decoder)
    index_of_symbol = decoder.read_int
    read_symbol = writers_schema.symbols[index_of_symbol]

    # TODO(jmhodges): figure out what unset means for resolution
    # schema resolution
    unless readers_schema.symbols.include?(read_symbol)
      # 'unset' here
    end

    read_symbol
  end

  # array: a series of blocks, each a count followed by that many
  # items. A negative count is followed by the block's byte size;
  # a zero count terminates the array.
  def read_array(writers_schema, readers_schema, decoder)
    read_items = []
    block_count = decoder.read_long
    while block_count != 0
      if block_count < 0
        block_count = -block_count
        block_size = decoder.read_long
      end
      block_count.times do
        read_items << read_data(writers_schema.items,
                                readers_schema.items,
                                decoder)
      end
      block_count = decoder.read_long
    end

    read_items
  end

  # map: same block structure as arrays, with string keys.
  def read_map(writers_schema, readers_schema, decoder)
    read_items = {}
    block_count = decoder.read_long
    while block_count != 0
      if block_count < 0
        block_count = -block_count
        block_size = decoder.read_long
      end
      block_count.times do
        key = decoder.read_string
        read_items[key] = read_data(writers_schema.values,
                                    readers_schema.values,
                                    decoder)
      end
      block_count = decoder.read_long
    end

    read_items
  end

  # union: a long selecting the writer's branch, then the value.
  def read_union(writers_schema, readers_schema, decoder)
    index_of_schema = decoder.read_long
    selected_writers_schema = writers_schema.schemas[index_of_schema]

    read_data(selected_writers_schema, readers_schema, decoder)
  end

  # record: writer's fields in order. Fields unknown to the reader are
  # skipped; reader-only fields are filled from their defaults.
  def read_record(writers_schema, readers_schema, decoder)
    readers_fields_hash = readers_schema.fields_hash
    read_record = {}
    writers_schema.fields.each do |field|
      if readers_field = readers_fields_hash[field.name]
        field_val = read_data(field.type, readers_field.type, decoder)
        read_record[field.name] = field_val
      else
        skip_data(field.type, decoder)
      end
    end

    # fill in the default values
    if readers_fields_hash.size > read_record.size
      writers_fields_hash = writers_schema.fields_hash
      readers_fields_hash.each do |field_name, field|
        unless writers_fields_hash.has_key? field_name
          if !field.default.nil?
            field_val = read_default_value(field.type, field.default)
            read_record[field.name] = field_val
          else
            # FIXME(jmhodges) another 'unset' here
          end
        end
      end
    end

    read_record
  end

  # Materialize a field default (JSON-typed value) as a Ruby value
  # conforming to field_schema.
  def read_default_value(field_schema, default_value)
    # Basically a JSON Decoder?
    case field_schema.type_sym
    when :null
      return nil
    when :boolean
      return default_value
    when :int, :long
      return Integer(default_value)
    when :float, :double
      return Float(default_value)
    when :enum, :fixed, :string, :bytes
      return default_value
    when :array
      read_array = []
      default_value.each do |json_val|
        item_val = read_default_value(field_schema.items, json_val)
        read_array << item_val
      end
      return read_array
    when :map
      read_map = {}
      default_value.each do |key, json_val|
        map_val = read_default_value(field_schema.values, json_val)
        read_map[key] = map_val
      end
      return read_map
    when :union
      # Union defaults are interpreted against the first branch.
      return read_default_value(field_schema.schemas[0], default_value)
    when :record, :error
      read_record = {}
      field_schema.fields.each do |field|
        json_val = default_value[field.name]
        json_val = field.default unless json_val
        field_val = read_default_value(field.type, json_val)
        read_record[field.name] = field_val
      end
      return read_record
    else
      fail_msg = "Unknown type: #{field_schema.type}"
      raise AvroError, fail_msg
    end
  end

  # Advance the decoder past one value of writers_schema without
  # building a Ruby object for it.
  def skip_data(writers_schema, decoder)
    case writers_schema.type_sym
    when :null
      decoder.skip_null
    when :boolean
      decoder.skip_boolean
    when :string
      decoder.skip_string
    when :int
      decoder.skip_int
    when :long
      decoder.skip_long
    when :float
      decoder.skip_float
    when :double
      decoder.skip_double
    when :bytes
      decoder.skip_bytes
    when :fixed
      skip_fixed(writers_schema, decoder)
    when :enum
      skip_enum(writers_schema, decoder)
    when :array
      skip_array(writers_schema, decoder)
    when :map
      skip_map(writers_schema, decoder)
    when :union
      skip_union(writers_schema, decoder)
    when :record, :error, :request
      skip_record(writers_schema, decoder)
    else
      raise AvroError, "Unknown schema type: #{writers_schema.type}"
    end
  end

  def skip_fixed(writers_schema, decoder)
    decoder.skip(writers_schema.size)
  end

  def skip_enum(writers_schema, decoder)
    decoder.skip_int
  end

  # Skipping a union still has to read the branch index to know what
  # follows.
  def skip_union(writers_schema, decoder)
    index = decoder.read_long
    skip_data(writers_schema.schemas[index], decoder)
  end

  def skip_array(writers_schema, decoder)
    skip_blocks(decoder) { skip_data(writers_schema.items, decoder) }
  end

  def skip_map(writers_schema, decoder)
    skip_blocks(decoder) {
      decoder.skip_string
      skip_data(writers_schema.values, decoder)
    }
  end

  def skip_record(writers_schema, decoder)
    writers_schema.fields.each{|f| skip_data(f.type, decoder) }
  end

  private
  # Walk the array/map block structure, yielding once per item.
  # A negative block count carries a byte size, letting the whole
  # block be skipped without decoding its items.
  def skip_blocks(decoder, &blk)
    block_count = decoder.read_long
    while block_count != 0
      if block_count < 0
        decoder.skip(decoder.read_long)
      else
        block_count.times &blk
      end
      block_count = decoder.read_long
    end
  end
end # DatumReader
531
+
532
# DatumWriter for generic ruby objects
class DatumWriter
  attr_accessor :writers_schema

  def initialize(writers_schema=nil)
    @writers_schema = writers_schema
  end

  # Serialize one datum using the configured writer's schema.
  def write(datum, encoder)
    write_data(writers_schema, datum, encoder)
  end

  # Core recursive write: apply logical-type conversion, validate the
  # datum against the schema, then dispatch on schema type.
  def write_data(writers_schema, logical_datum, encoder)
    datum = writers_schema.type_adapter.encode(logical_datum)

    unless Schema.validate(writers_schema, datum)
      raise AvroTypeError.new(writers_schema, datum)
    end

    # function dispatch to write datum
    case writers_schema.type_sym
    when :null;    encoder.write_null(datum)
    when :boolean; encoder.write_boolean(datum)
    when :string;  encoder.write_string(datum)
    when :int;     encoder.write_int(datum)
    when :long;    encoder.write_long(datum)
    when :float;   encoder.write_float(datum)
    when :double;  encoder.write_double(datum)
    when :bytes;   encoder.write_bytes(datum)
    when :fixed;   write_fixed(writers_schema, datum, encoder)
    when :enum;    write_enum(writers_schema, datum, encoder)
    when :array;   write_array(writers_schema, datum, encoder)
    when :map;     write_map(writers_schema, datum, encoder)
    when :union;   write_union(writers_schema, datum, encoder)
    when :record, :error, :request; write_record(writers_schema, datum, encoder)
    else
      raise AvroError.new("Unknown type: #{writers_schema.type}")
    end
  end

  # fixed: raw bytes, no length prefix.
  def write_fixed(writers_schema, datum, encoder)
    encoder.write(datum)
  end

  # enum: the int index of the symbol in the schema's symbol list.
  def write_enum(writers_schema, datum, encoder)
    index_of_datum = writers_schema.symbols.index(datum)
    encoder.write_int(index_of_datum)
  end

  # array: one counted block of items (when non-empty), then a zero
  # terminator.
  def write_array(writers_schema, datum, encoder)
    if datum.size > 0
      encoder.write_long(datum.size)
      datum.each do |item|
        write_data(writers_schema.items, item, encoder)
      end
    end
    encoder.write_long(0)
  end

  # map: one counted block of key/value pairs (when non-empty), then a
  # zero terminator.
  def write_map(writers_schema, datum, encoder)
    if datum.size > 0
      encoder.write_long(datum.size)
      datum.each do |k,v|
        encoder.write_string(k)
        write_data(writers_schema.values, v, encoder)
      end
    end
    encoder.write_long(0)
  end

  # union: the index of the first branch the datum validates against,
  # followed by the value itself.
  def write_union(writers_schema, datum, encoder)
    index_of_schema = -1
    found = writers_schema.schemas.
      find{|e| index_of_schema += 1; found = Schema.validate(e, datum) }
    unless found # Because find_index doesn't exist in 1.8.6
      raise AvroTypeError.new(writers_schema, datum)
    end
    encoder.write_long(index_of_schema)
    write_data(writers_schema.schemas[index_of_schema], datum, encoder)
  end

  # record: each field's value, in schema order.
  def write_record(writers_schema, datum, encoder)
    writers_schema.fields.each do |field|
      write_data(field.type, datum[field.name], encoder)
    end
  end
end # DatumWriter
618
+ end
619
+ end