avro-salsify-fork 1.9.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,366 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ require 'openssl'
18
+
19
+ module Avro
20
+ module DataFile
21
# Version byte stored as the last byte of the file magic.
VERSION = 1

# File magic: the three ASCII bytes "Obj" followed by the version byte.
MAGIC = ['Obj', VERSION].pack('a3c')
MAGIC.force_encoding('BINARY') if MAGIC.respond_to?(:force_encoding)

# Length of the magic in bytes (falls back to #size where #bytesize is missing).
MAGIC_SIZE = if MAGIC.respond_to?(:bytesize)
               MAGIC.bytesize
             else
               MAGIC.size
             end

# Length in bytes of the sync marker read and written between blocks.
SYNC_SIZE = 16

# Buffered byte count at which Writer#<< flushes the current block.
SYNC_INTERVAL = 4000 * SYNC_SIZE

# Schema used to encode/decode the header metadata map.
META_SCHEMA = Schema.parse('{"type": "map", "values": "bytes"}')

VALID_ENCODINGS = ['binary'] # not used yet

# Raised for data-file level problems (bad mode, bad magic, unknown codec).
class DataFileError < AvroError; end
31
+
32
# Opens an Avro data file for reading or writing.
#
# file_path - path of the file to open.
# mode      - 'r' (default) to read or 'w' to write.
# schema    - schema JSON text. Required for 'w'; for 'r' it is handed to
#             the datum reader as the reader's schema.
# codec     - codec name, class or instance; only used for 'w'.
#
# With a block, the reader/writer is yielded and closed once the block
# finishes, even if it raises. Without a block the caller owns the returned
# object and must close it.
#
# Returns the Reader or Writer.
# Raises DataFileError for an unsupported mode or when writing without a
# schema.
def self.open(file_path, mode='r', schema=nil, codec=nil)
  schema = Avro::Schema.parse(schema) if schema
  case mode
  when 'w'
    unless schema
      raise DataFileError, "Writing an Avro file requires a schema."
    end
    file = File.open(file_path, 'wb')
    io = open_writer(file, schema, codec)
  when 'r'
    file = File.open(file_path, 'rb')
    io = open_reader(file, schema)
  else
    raise DataFileError, "Only modes 'r' and 'w' allowed. You gave #{mode.inspect}."
  end

  yield io if block_given?
  io
ensure
  if io
    io.close if block_given?
  elsif file && !file.closed?
    # Building the reader/writer raised after the file was opened: nobody
    # else holds the handle, so close it here instead of leaking it.
    file.close
  end
end
51
+
52
# Returns the registry of codecs as a Hash of codec name (String) => codec
# instance. An empty Hash is returned (rather than nil) when nothing has
# been registered yet.
def self.codecs
  @codecs ||= {}
end
55
+
56
# Adds a codec to the registry under its codec_name (converted to String).
#
# codec - a codec instance, or a codec Class; a Class that does not itself
#         respond to codec_name is instantiated with no arguments first.
#
# Returns the registered codec instance.
def self.register_codec(codec)
  @codecs ||= {}
  instance = codec
  if codec.is_a?(Class) && !codec.respond_to?(:codec_name)
    instance = codec.new
  end
  @codecs[instance.codec_name.to_s] = instance
end
61
+
62
# Resolves a codec specification to a codec instance.
#
# codec - nil (treated as 'null'), an object responding to both compress
#         and decompress (returned as is), a Class (instantiated with no
#         arguments), or a registered codec name given as String/Symbol.
#
# Returns the codec instance.
# Raises DataFileError when the name is not registered, including the case
# where no codec has been registered at all.
def self.get_codec(codec)
  codec ||= 'null'
  if codec.respond_to?(:compress) && codec.respond_to?(:decompress)
    codec # it's a codec instance
  elsif codec.is_a?(Class)
    codec.new # it's a codec class
  else
    # it's a string or symbol (codec name)
    (@codecs || {}).fetch(codec.to_s) do
      raise DataFileError, "Unknown codec: #{codec.inspect}"
    end
  end
end
74
+
75
class << self
  private

  # Builds a Writer on +file+ whose datum writer uses +schema+; +codec+ is
  # passed through to the Writer unchanged.
  def open_writer(file, schema, codec=nil)
    Avro::DataFile::Writer.new(file, Avro::IO::DatumWriter.new(schema), schema, codec)
  end

  # Builds a Reader on +file+. +schema+ becomes the datum reader's readers
  # schema; the writers schema is left nil here.
  def open_reader(file, schema)
    Avro::DataFile::Reader.new(file, Avro::IO::DatumReader.new(nil, schema))
  end
end
87
+
88
# Writes an Avro data file: a header (magic, metadata map, sync marker)
# followed by blocks. Each block is written as the datum count, the byte
# size of the (possibly compressed) payload, the payload, and the sync
# marker.
class Writer
  # Returns SYNC_SIZE random bytes to use as a file's sync marker.
  def self.generate_sync_marker
    OpenSSL::Random.random_bytes(SYNC_SIZE)
  end

  attr_reader :writer, :encoder, :datum_writer, :buffer_writer, :buffer_encoder, :sync_marker, :meta, :codec
  attr_accessor :block_count

  # writer         - stream to write to. In append mode (no writers_schema)
  #                  it is also read from and seeked.
  # datum_writer   - object that encodes individual datums.
  # writers_schema - schema for a new file; nil means append to the file
  #                  already present in +writer+.
  # codec          - codec name, class or instance (new files only).
  # meta           - metadata Hash; 'avro.codec' and 'avro.schema' are set
  #                  on it (the passed Hash is modified in place).
  def initialize(writer, datum_writer, writers_schema=nil, codec=nil, meta={})
    # If writers_schema is not present, presume we're appending
    @writer = writer
    @encoder = IO::BinaryEncoder.new(@writer)
    @datum_writer = datum_writer
    @meta = meta
    # Datums are encoded into this in-memory buffer until a block is written.
    @buffer_writer = StringIO.new('', 'w')
    @buffer_writer.set_encoding('BINARY') if @buffer_writer.respond_to?(:set_encoding)
    @buffer_encoder = IO::BinaryEncoder.new(@buffer_writer)
    @block_count = 0

    if writers_schema
      @sync_marker = Writer.generate_sync_marker
      @codec = DataFile.get_codec(codec)
      @meta['avro.codec'] = @codec.codec_name.to_s
      @meta['avro.schema'] = writers_schema.to_s
      datum_writer.writers_schema = writers_schema
      write_header
    else
      # open writer for reading to collect metadata
      dfr = Reader.new(writer, Avro::IO::DatumReader.new)

      # FIXME(jmhodges): collect arbitrary metadata
      # collect metadata
      @sync_marker = dfr.sync_marker
      @meta['avro.codec'] = dfr.meta['avro.codec']
      @codec = DataFile.get_codec(meta['avro.codec'])

      # get schema used to write existing file
      schema_from_file = dfr.meta['avro.schema']
      @meta['avro.schema'] = schema_from_file
      datum_writer.writers_schema = Schema.parse(schema_from_file)

      # seek to the end of the file and prepare for writing
      writer.seek(0,2)
    end
  end

  # Append a datum to the file
  def <<(datum)
    datum_writer.write(datum, buffer_encoder)
    self.block_count += 1

    # if the data to write is larger than the sync interval, write
    # the block
    if buffer_writer.tell >= SYNC_INTERVAL
      write_block
    end
  end

  # Return the current position as a value that may be passed to
  # DataFileReader.seek(long). Forces the end of the current block,
  # emitting a synchronization marker.
  def sync
    write_block
    writer.tell
  end

  # Flush the current state of the file, including metadata
  def flush
    write_block
    writer.flush
  end

  # Writes any buffered block, flushes, and closes the underlying stream.
  # The stream is closed even when flushing raises, so a failed final write
  # does not leak the handle; the flush error still propagates.
  def close
    flush
  ensure
    writer.close
  end

  private

  def write_header
    # write magic
    writer.write(MAGIC)

    # write metadata
    datum_writer.write_data(META_SCHEMA, meta, encoder)

    # write sync marker
    writer.write(sync_marker)
  end

  # Writes the buffered datums as one block and resets the buffer. Does
  # nothing when no datum has been appended since the last block.
  # TODO(jmhodges): make a schema for blocks and use datum_writer
  # TODO(jmhodges): do we really need the number of items in the block?
  def write_block
    if block_count > 0
      # write number of items in block and block size in bytes
      encoder.write_long(block_count)
      to_write = codec.compress(buffer_writer.string)
      encoder.write_long(to_write.respond_to?(:bytesize) ? to_write.bytesize : to_write.size)

      # write block contents
      writer.write(to_write)

      # write sync marker
      writer.write(sync_marker)

      # reset buffer
      buffer_writer.truncate(0)
      buffer_writer.rewind
      self.block_count = 0
    end
  end
end
200
+
201
+ # Read files written by DataFileWriter
202
# Iterates over the datums stored in an Avro data file. The header is read
# on construction; blocks are read and decompressed lazily by #each.
class Reader
  include ::Enumerable

  # The reader and binary decoder for the raw file stream
  attr_reader :reader, :decoder

  # The binary decoder for the contents of a block (after codec decompression)
  attr_reader :block_decoder

  # NOTE: file_length is exposed but nothing in this class assigns it.
  attr_reader :datum_reader, :sync_marker, :meta, :file_length, :codec
  attr_accessor :block_count # records remaining in current block

  # reader       - stream supporting read, seek, tell, eof? and close.
  # datum_reader - decodes individual datums; its writers_schema is set
  #                from the 'avro.schema' entry of the file metadata.
  #
  # Raises DataFileError when the stream does not start with the Avro magic.
  def initialize(reader, datum_reader)
    @reader = reader
    @decoder = IO::BinaryDecoder.new(reader)
    @datum_reader = datum_reader

    # read the header: magic, meta, sync
    read_header

    @codec = DataFile.get_codec(meta['avro.codec'])

    # get ready to read
    @block_count = 0
    datum_reader.writers_schema = Schema.parse meta['avro.schema']
  end

  # Iterates through each datum in this file, yielding them in file order.
  # Without a block an Enumerator is returned and nothing is consumed.
  def each
    return enum_for(:each) unless block_given?

    loop do
      if block_count == 0
        break if eof?
        # A sync marker may precede the next block; stop if it was the
        # last thing in the file.
        break if skip_sync && eof?
        read_block_header
        # A block may declare zero datums: move on to the next block
        # instead of decoding from an empty payload.
        next if block_count == 0
      end

      datum = datum_reader.read(block_decoder)
      self.block_count -= 1
      yield(datum)
    end
  end

  def eof?; reader.eof?; end

  def close
    reader.close
  end

  private

  # Reads magic, metadata map and sync marker from the start of the stream.
  def read_header
    # seek to the beginning of the file to get magic block
    reader.seek(0, 0)

    # check magic number (read returns nil on an empty stream)
    magic_in_file = reader.read(MAGIC_SIZE)
    if magic_in_file.nil? || magic_in_file.size < MAGIC_SIZE
      msg = 'Not an Avro data file: shorter than the Avro magic block'
      raise DataFileError, msg
    elsif magic_in_file != MAGIC
      msg = "Not an Avro data file: #{magic_in_file.inspect} doesn't match #{MAGIC.inspect}"
      raise DataFileError, msg
    end

    # read metadata
    @meta = datum_reader.read_data(META_SCHEMA,
                                   META_SCHEMA,
                                   decoder)
    # read sync marker
    @sync_marker = reader.read(SYNC_SIZE)
  end

  # Reads the datum count and byte size of the next block, then loads and
  # decompresses its payload into a fresh block decoder.
  def read_block_header
    self.block_count = decoder.read_long
    block_bytes = decoder.read_long
    data = codec.decompress(reader.read(block_bytes))
    @block_decoder = IO::BinaryDecoder.new(StringIO.new(data))
  end

  # read the length of the sync marker; if it matches the sync
  # marker, return true. Otherwise, seek back to where we started
  # and return false
  def skip_sync
    start = reader.tell
    proposed_sync_marker = reader.read(SYNC_SIZE)
    return true if proposed_sync_marker == sync_marker

    # Restore the absolute start position: fewer than SYNC_SIZE bytes may
    # have been consumed near the end of the stream.
    reader.seek(start, 0)
    false
  end
end
299
+
300
+
301
# Pass-through codec: block data is stored exactly as given.
class NullCodec
  # Name recorded under 'avro.codec' in the file metadata.
  def codec_name
    'null'
  end

  # Returns +data+ itself, untouched.
  def compress(data)
    data
  end

  # Returns +data+ itself, untouched.
  def decompress(data)
    data
  end
end
306
+
307
# Codec that stores blocks as raw DEFLATE data (no zlib header/checksum).
class DeflateCodec
  # Compression level handed to Zlib::Deflate.
  attr_reader :level

  def initialize(level=Zlib::DEFAULT_COMPRESSION)
    @level = level
  end

  def codec_name; 'deflate'; end

  # Inflates a raw DEFLATE payload and returns the original bytes.
  def decompress(compressed)
    # Passing a negative number to Inflate puts it into "raw" RFC1951 mode
    # (without the RFC1950 header & checksum). See the docs for
    # inflateInit2 in http://www.zlib.net/manual.html
    zstream = Zlib::Inflate.new(-Zlib::MAX_WBITS)
    data = zstream.inflate(compressed)
    data << zstream.finish
  ensure
    # zstream is nil when the constructor itself raised; closing
    # unconditionally would replace that error with a NoMethodError.
    zstream.close if zstream
  end

  # Deflates +data+ at the configured level into a raw DEFLATE payload.
  def compress(data)
    zstream = Zlib::Deflate.new(level, -Zlib::MAX_WBITS)
    compressed = zstream.deflate(data)
    compressed << zstream.finish
  ensure
    # See #decompress: keep the original error if Deflate.new failed.
    zstream.close if zstream
  end
end
335
+
336
# Codec backed by the optional `snappy` gem, which is loaded on first use.
class SnappyCodec
  def codec_name
    'snappy'
  end

  # Returns Snappy.inflate(data); raises LoadError if the gem is missing.
  def decompress(data)
    load_snappy!
    Snappy.inflate(data)
  end

  # Returns Snappy.deflate(data); raises LoadError if the gem is missing.
  def compress(data)
    load_snappy!
    Snappy.deflate(data)
  end

  private

  # Requires the snappy library unless the Snappy constant is already
  # defined, re-raising a LoadError with an installation hint.
  def load_snappy!
    return if defined?(Snappy)

    begin
      require 'snappy'
    rescue LoadError
      raise LoadError, "Snappy compression is not available, please install the `snappy` gem."
    end
  end
end
357
+
358
# Register the codecs defined in this file, in this order.
[NullCodec, DeflateCodec, SnappyCodec].each do |codec_class|
  DataFile.register_codec(codec_class)
end

# TODO this constant won't be updated if you register another codec.
# Deprecated in favor of Avro::DataFile::codecs
VALID_CODECS = DataFile.codecs.keys
365
+ end
366
+ end
@@ -0,0 +1,619 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ module Avro
18
+ module IO
19
+ # Raised when datum is not an example of schema
20
class AvroTypeError < AvroError
  # expected_schema - schema the datum failed to match (interpolated via to_s).
  # datum           - offending value (shown via inspect).
  def initialize(expected_schema, datum)
    message = "The datum #{datum.inspect} is not an example of schema #{expected_schema}"
    super(message)
  end
end
25
+
26
+ # Raised when writer's and reader's schema do not match
27
class SchemaMatchException < AvroError
  # Both schemas are interpolated (via to_s) into the error message.
  def initialize(writers_schema, readers_schema)
    message = "Writer's schema #{writers_schema} and Reader's schema #{readers_schema} do not match."
    super(message)
  end
end
33
+
34
+ # FIXME(jmhodges) move validate to this module?
35
+
36
# Decodes Avro binary-encoded leaf values from a stream.
class BinaryDecoder
  # Read leaf values

  # reader is an object on which we can call read, seek and tell.
  attr_reader :reader
  def initialize(reader)
    @reader = reader
  end

  # Reads one byte and returns it as an Integer in 0..255.
  # Raises EOFError when the stream has no byte left.
  def byte!
    chunk = @reader.read(1)
    if chunk.nil? || chunk.empty?
      raise EOFError, 'end of stream reached while reading a byte'
    end
    chunk.unpack('C').first
  end

  def read_null
    # null is written as zero bytes
    nil
  end

  # True only when the stored byte is exactly 1.
  def read_boolean
    byte! == 1
  end

  def read_int; read_long; end

  def read_long
    # int and long values are written using variable-length,
    # zig-zag coding.
    b = byte!
    n = b & 0x7F
    shift = 7
    # The high bit of each byte signals that another 7-bit group follows.
    while (b & 0x80) != 0
      b = byte!
      n |= (b & 0x7F) << shift
      shift += 7
    end
    # Undo the zig-zag mapping to recover the signed value.
    (n >> 1) ^ -(n & 1)
  end

  def read_float
    # A float is written as 4 bytes.
    # The float is converted into a 32-bit integer using a method
    # equivalent to Java's floatToIntBits and then encoded in
    # little-endian format.
    @reader.read(4).unpack('e')[0]
  end

  def read_double
    # A double is written as 8 bytes.
    # The double is converted into a 64-bit integer using a method
    # equivalent to Java's doubleToLongBits and then encoded in
    # little-endian format.
    @reader.read(8).unpack('E')[0]
  end

  def read_bytes
    # Bytes are encoded as a long followed by that many bytes of
    # data.
    read(read_long)
  end

  def read_string
    # A string is encoded as a long followed by that many bytes of
    # UTF-8 encoded character data.
    read_bytes.tap do |string|
      string.force_encoding("UTF-8") if string.respond_to? :force_encoding
    end
  end

  def read(len)
    # Read n bytes
    @reader.read(len)
  end

  def skip_null
    nil
  end

  def skip_boolean
    skip(1)
  end

  def skip_int
    skip_long
  end

  # Consumes one variable-length int/long without decoding it.
  def skip_long
    b = byte!
    while (b & 0x80) != 0
      b = byte!
    end
  end

  def skip_float
    skip(4)
  end

  def skip_double
    skip(8)
  end

  def skip_bytes
    skip(read_long)
  end

  def skip_string
    skip_bytes
  end

  # Moves the stream position forward by n bytes.
  def skip(n)
    reader.seek(reader.tell() + n)
  end
end
148
+
149
+ # Write leaf values
150
# Encodes leaf values in Avro binary form onto a stream.
class BinaryEncoder
  attr_reader :writer

  # writer - object responding to write(string).
  def initialize(writer)
    @writer = writer
  end

  # null is written as zero bytes
  def write_null(datum)
    nil
  end

  # a boolean is written as a single byte
  # whose value is either 0 (false) or 1 (true).
  def write_boolean(datum)
    on_disk = datum ? 1.chr : 0.chr
    writer.write(on_disk)
  end

  # int and long values are written using variable-length,
  # zig-zag coding.
  def write_int(n)
    write_long(n)
  end

  # int and long values are written using variable-length,
  # zig-zag coding.
  def write_long(n)
    # Zig-zag: map the signed value to a non-negative one.
    n = (n << 1) ^ (n >> 63)
    # Emit 7 bits per byte, low group first; the high bit marks "more".
    while (n & ~0x7F) != 0
      @writer.write(((n & 0x7f) | 0x80).chr)
      n >>= 7
    end
    @writer.write(n.chr)
  end

  # A float is written as 4 bytes.
  # The float is converted into a 32-bit integer using a method
  # equivalent to Java's floatToIntBits and then encoded in
  # little-endian format.
  def write_float(datum)
    @writer.write([datum].pack('e'))
  end

  # A double is written as 8 bytes.
  # The double is converted into a 64-bit integer using a method
  # equivalent to Java's doubleToLongBits and then encoded in
  # little-endian format.
  def write_double(datum)
    @writer.write([datum].pack('E'))
  end

  # Bytes are encoded as a long followed by that many bytes of data.
  def write_bytes(datum)
    write_long(datum.bytesize)
    @writer.write(datum)
  end

  # A string is encoded as a long followed by that many bytes of
  # UTF-8 encoded character data
  def write_string(datum)
    datum = datum.encode('utf-8') if datum.respond_to? :encode
    write_bytes(datum)
  end

  # Write an arbitrary datum verbatim, with no length prefix.
  def write(datum)
    writer.write(datum)
  end
end
221
+
222
# Decodes datums written with a writer's schema and resolves them against
# a reader's schema.
class DatumReader
  # Returns true when data written with writers_schema may be read as
  # readers_schema. Any pairing involving a union is accepted here; the
  # branches are checked later while reading.
  def self.match_schemas(writers_schema, readers_schema)
    w_type = writers_schema.type_sym
    r_type = readers_schema.type_sym

    # This conditional is begging for some OO love.
    if w_type == :union || r_type == :union
      return true
    end

    if w_type == r_type
      return true if Schema::PRIMITIVE_TYPES_SYM.include?(r_type)

      case r_type
      when :record
        return writers_schema.fullname == readers_schema.fullname
      when :error
        return writers_schema.fullname == readers_schema.fullname
      when :request
        return true
      when :fixed
        return writers_schema.fullname == readers_schema.fullname &&
               writers_schema.size == readers_schema.size
      when :enum
        return writers_schema.fullname == readers_schema.fullname
      when :map
        return writers_schema.values.type == readers_schema.values.type
      when :array
        return writers_schema.items.type == readers_schema.items.type
      end
    end

    # Handle schema promotion
    if w_type == :int && [:long, :float, :double].include?(r_type)
      return true
    elsif w_type == :long && [:float, :double].include?(r_type)
      return true
    elsif w_type == :float && r_type == :double
      return true
    end

    return false
  end

  attr_accessor :writers_schema, :readers_schema

  def initialize(writers_schema=nil, readers_schema=nil)
    @writers_schema = writers_schema
    @readers_schema = readers_schema
  end

  # Reads one datum from decoder. When no readers schema has been set, the
  # writers schema is adopted as the readers schema (and stays set).
  def read(decoder)
    self.readers_schema = writers_schema unless readers_schema
    read_data(writers_schema, readers_schema, decoder)
  end

  # Reads one datum of writers_schema from decoder, resolved against
  # readers_schema, and passes it through the readers schema's type adapter.
  #
  # Raises SchemaMatchException when the schemas do not match, and
  # AvroError for an unknown writer schema type.
  def read_data(writers_schema, readers_schema, decoder)
    # schema matching
    unless self.class.match_schemas(writers_schema, readers_schema)
      raise SchemaMatchException.new(writers_schema, readers_schema)
    end

    # schema resolution: reader's schema is a union, writer's
    # schema is not
    if writers_schema.type_sym != :union && readers_schema.type_sym == :union
      rs = readers_schema.schemas.find{|s|
        self.class.match_schemas(writers_schema, s)
      }
      return read_data(writers_schema, rs, decoder) if rs
      raise SchemaMatchException.new(writers_schema, readers_schema)
    end

    # function dispatch for reading data based on type of writer's
    # schema
    datum = case writers_schema.type_sym
    when :null;    decoder.read_null
    when :boolean; decoder.read_boolean
    when :string;  decoder.read_string
    when :int;     decoder.read_int
    when :long;    decoder.read_long
    when :float;   decoder.read_float
    when :double;  decoder.read_double
    when :bytes;   decoder.read_bytes
    when :fixed;   read_fixed(writers_schema, readers_schema, decoder)
    when :enum;    read_enum(writers_schema, readers_schema, decoder)
    when :array;   read_array(writers_schema, readers_schema, decoder)
    when :map;     read_map(writers_schema, readers_schema, decoder)
    when :union;   read_union(writers_schema, readers_schema, decoder)
    when :record, :error, :request;  read_record(writers_schema, readers_schema, decoder)
    else
      raise AvroError, "Cannot read unknown schema type: #{writers_schema.type}"
    end

    readers_schema.type_adapter.decode(datum)
  end

  # Reads exactly writers_schema.size raw bytes.
  def read_fixed(writers_schema, readers_schema, decoder)
    decoder.read(writers_schema.size)
  end

  # Reads the symbol index and returns the writer's symbol at that index.
  def read_enum(writers_schema, readers_schema, decoder)
    index_of_symbol = decoder.read_int
    read_symbol = writers_schema.symbols[index_of_symbol]

    # TODO(jmhodges): figure out what unset means for resolution
    # schema resolution
    unless readers_schema.symbols.include?(read_symbol)
      # 'unset' here
    end

    read_symbol
  end

  # Reads a block-encoded array: a series of counted blocks ended by a
  # zero count.
  def read_array(writers_schema, readers_schema, decoder)
    read_items = []
    block_count = decoder.read_long
    while block_count != 0
      if block_count < 0
        # A negative count is followed by the block's byte size, which
        # is only useful for skipping; consume and ignore it.
        block_count = -block_count
        decoder.read_long
      end
      block_count.times do
        read_items << read_data(writers_schema.items,
                                readers_schema.items,
                                decoder)
      end
      block_count = decoder.read_long
    end

    read_items
  end

  # Reads a block-encoded map of string keys to values (same block layout
  # as arrays).
  def read_map(writers_schema, readers_schema, decoder)
    read_items = {}
    block_count = decoder.read_long
    while block_count != 0
      if block_count < 0
        # Negative count: the block byte size follows; not needed here.
        block_count = -block_count
        decoder.read_long
      end
      block_count.times do
        key = decoder.read_string
        read_items[key] = read_data(writers_schema.values,
                                    readers_schema.values,
                                    decoder)
      end
      block_count = decoder.read_long
    end

    read_items
  end

  # Reads the branch index, then the value using that writer branch.
  def read_union(writers_schema, readers_schema, decoder)
    index_of_schema = decoder.read_long
    selected_writers_schema = writers_schema.schemas[index_of_schema]

    read_data(selected_writers_schema, readers_schema, decoder)
  end

  # Reads the writer's fields in order. Fields the reader does not know are
  # skipped; reader-only fields with a non-nil default are filled in.
  def read_record(writers_schema, readers_schema, decoder)
    readers_fields_hash = readers_schema.fields_hash
    read_record = {}
    writers_schema.fields.each do |field|
      if readers_field = readers_fields_hash[field.name]
        field_val = read_data(field.type, readers_field.type, decoder)
        read_record[field.name] = field_val
      else
        skip_data(field.type, decoder)
      end
    end

    # fill in the default values
    if readers_fields_hash.size > read_record.size
      writers_fields_hash = writers_schema.fields_hash
      readers_fields_hash.each do |field_name, field|
        unless writers_fields_hash.has_key? field_name
          if !field.default.nil?
            field_val = read_default_value(field.type, field.default)
            read_record[field.name] = field_val
          else
            # FIXME(jmhodges) another 'unset' here
          end
        end
      end
    end

    read_record
  end

  # Converts a JSON-decoded default value into a datum for field_schema.
  # Raises AvroError for an unknown schema type.
  def read_default_value(field_schema, default_value)
    # Basically a JSON Decoder?
    case field_schema.type_sym
    when :null
      return nil
    when :boolean
      return default_value
    when :int, :long
      return Integer(default_value)
    when :float, :double
      return Float(default_value)
    when :enum, :fixed, :string, :bytes
      return default_value
    when :array
      read_array = []
      default_value.each do |json_val|
        item_val = read_default_value(field_schema.items, json_val)
        read_array << item_val
      end
      return read_array
    when :map
      read_map = {}
      default_value.each do |key, json_val|
        map_val = read_default_value(field_schema.values, json_val)
        read_map[key] = map_val
      end
      return read_map
    when :union
      # The default is interpreted against the first branch of the union.
      return read_default_value(field_schema.schemas[0], default_value)
    when :record, :error
      read_record = {}
      field_schema.fields.each do |field|
        json_val = default_value[field.name]
        # Fall back to the field's own default only when no value was
        # given; an explicit `false` is a real value and must be kept.
        json_val = field.default if json_val.nil?
        field_val = read_default_value(field.type, json_val)
        read_record[field.name] = field_val
      end
      return read_record
    else
      fail_msg = "Unknown type: #{field_schema.type}"
      raise AvroError, fail_msg
    end
  end

  # Advances decoder past one datum of writers_schema without building it.
  # Raises AvroError for an unknown schema type.
  def skip_data(writers_schema, decoder)
    case writers_schema.type_sym
    when :null
      decoder.skip_null
    when :boolean
      decoder.skip_boolean
    when :string
      decoder.skip_string
    when :int
      decoder.skip_int
    when :long
      decoder.skip_long
    when :float
      decoder.skip_float
    when :double
      decoder.skip_double
    when :bytes
      decoder.skip_bytes
    when :fixed
      skip_fixed(writers_schema, decoder)
    when :enum
      skip_enum(writers_schema, decoder)
    when :array
      skip_array(writers_schema, decoder)
    when :map
      skip_map(writers_schema, decoder)
    when :union
      skip_union(writers_schema, decoder)
    when :record, :error, :request
      skip_record(writers_schema, decoder)
    else
      raise AvroError, "Unknown schema type: #{writers_schema.type}"
    end
  end

  def skip_fixed(writers_schema, decoder)
    decoder.skip(writers_schema.size)
  end

  def skip_enum(writers_schema, decoder)
    decoder.skip_int
  end

  def skip_union(writers_schema, decoder)
    index = decoder.read_long
    skip_data(writers_schema.schemas[index], decoder)
  end

  def skip_array(writers_schema, decoder)
    skip_blocks(decoder) { skip_data(writers_schema.items, decoder) }
  end

  def skip_map(writers_schema, decoder)
    skip_blocks(decoder) {
      decoder.skip_string
      skip_data(writers_schema.values, decoder)
    }
  end

  def skip_record(writers_schema, decoder)
    writers_schema.fields.each{|f| skip_data(f.type, decoder) }
  end

  private

  # Walks the blocks of an array/map. A block with a negative count carries
  # its byte size and is skipped in one seek; otherwise the given block is
  # called once per item to skip it.
  def skip_blocks(decoder, &blk)
    block_count = decoder.read_long
    while block_count != 0
      if block_count < 0
        decoder.skip(decoder.read_long)
      else
        block_count.times(&blk)
      end
      block_count = decoder.read_long
    end
  end
end # DatumReader
531
+
532
+ # DatumWriter for generic ruby objects
533
# Encodes generic ruby objects according to a writer's schema.
class DatumWriter
  attr_accessor :writers_schema
  def initialize(writers_schema=nil)
    @writers_schema = writers_schema
  end

  # Writes datum to encoder using the configured writers schema.
  def write(datum, encoder)
    write_data(writers_schema, datum, encoder)
  end

  # Converts logical_datum with the schema's type adapter, validates the
  # result against writers_schema and writes it to encoder.
  #
  # Raises AvroTypeError when the datum does not validate, and AvroError
  # for an unknown schema type.
  def write_data(writers_schema, logical_datum, encoder)
    datum = writers_schema.type_adapter.encode(logical_datum)

    unless Schema.validate(writers_schema, datum)
      raise AvroTypeError.new(writers_schema, datum)
    end

    # function dispatch to write datum
    case writers_schema.type_sym
    when :null;    encoder.write_null(datum)
    when :boolean; encoder.write_boolean(datum)
    when :string;  encoder.write_string(datum)
    when :int;     encoder.write_int(datum)
    when :long;    encoder.write_long(datum)
    when :float;   encoder.write_float(datum)
    when :double;  encoder.write_double(datum)
    when :bytes;   encoder.write_bytes(datum)
    when :fixed;   write_fixed(writers_schema, datum, encoder)
    when :enum;    write_enum(writers_schema, datum, encoder)
    when :array;   write_array(writers_schema, datum, encoder)
    when :map;     write_map(writers_schema, datum, encoder)
    when :union;   write_union(writers_schema, datum, encoder)
    when :record, :error, :request;  write_record(writers_schema, datum, encoder)
    else
      raise AvroError.new("Unknown type: #{writers_schema.type}")
    end
  end

  # Fixed values are written verbatim, with no length prefix.
  def write_fixed(writers_schema, datum, encoder)
    encoder.write(datum)
  end

  # Enums are written as the index of the symbol in the schema's symbols.
  def write_enum(writers_schema, datum, encoder)
    index_of_datum = writers_schema.symbols.index(datum)
    encoder.write_int(index_of_datum)
  end

  # A non-empty array is written as one block (item count, then the
  # items); a zero count always terminates the array.
  def write_array(writers_schema, datum, encoder)
    if datum.size > 0
      encoder.write_long(datum.size)
      datum.each do |item|
        write_data(writers_schema.items, item, encoder)
      end
    end
    encoder.write_long(0)
  end

  # A non-empty map is written as one block (pair count, then each string
  # key followed by its value); a zero count always terminates the map.
  def write_map(writers_schema, datum, encoder)
    if datum.size > 0
      encoder.write_long(datum.size)
      datum.each do |k,v|
        encoder.write_string(k)
        write_data(writers_schema.values, v, encoder)
      end
    end
    encoder.write_long(0)
  end

  # Writes the index of the first branch that validates the datum, then
  # the datum itself using that branch.
  #
  # Raises AvroTypeError when no branch validates the datum.
  def write_union(writers_schema, datum, encoder)
    index_of_schema = writers_schema.schemas.find_index do |branch|
      Schema.validate(branch, datum)
    end
    raise AvroTypeError.new(writers_schema, datum) unless index_of_schema

    encoder.write_long(index_of_schema)
    write_data(writers_schema.schemas[index_of_schema], datum, encoder)
  end

  # Fields are written in schema order, looked up in datum by field name.
  def write_record(writers_schema, datum, encoder)
    writers_schema.fields.each do |field|
      write_data(field.type, datum[field.name], encoder)
    end
  end
end # DatumWriter
618
+ end
619
+ end