avro 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,572 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ module Avro
18
+ module IO
19
# Raised when a datum is not a valid instance of the schema it is
# being written with.
class AvroTypeError < AvroError
  # expected_schema:: the schema the datum was checked against
  # datum:: the offending value; it is rendered with #inspect in the message
  def initialize(expected_schema, datum)
    message = "The datum #{datum.inspect} is not an example of schema #{expected_schema}"
    super(message)
  end
end
25
+
26
# Raised when the writer's schema and the reader's schema cannot be
# resolved against each other.
class SchemaMatchException < AvroError
  # writers_schema:: schema the data was written with
  # readers_schema:: schema the caller wants to read the data as
  def initialize(writers_schema, readers_schema)
    super("Writer's schema #{writers_schema} and Reader's schema #{readers_schema} do not match.")
  end
end
33
+
34
+ # FIXME(jmhodges) move validate to this module?
35
+
36
# Reads leaf (primitive) values in Avro's binary encoding from an
# IO-like object.
class BinaryDecoder
  # reader is an object on which we can call read, seek and tell.
  attr_reader :reader

  def initialize(reader)
    @reader = reader
  end

  # Reads a single byte and returns it as an Integer (0..255).
  #
  # String#[] returns an Integer on Ruby 1.8 but a one-character String
  # on 1.9+, so the byte is unpacked explicitly to get an Integer on
  # every version.
  def byte!
    @reader.read(1).unpack('C').first
  end

  # null is written as zero bytes, so nothing is consumed.
  def read_null
    nil
  end

  # A boolean is a single byte: 1 is true, anything else is false.
  def read_boolean
    byte! == 1
  end

  def read_int; read_long; end

  # int and long values are written using variable-length,
  # zig-zag coding.
  def read_long
    b = byte!
    n = b & 0x7F
    shift = 7
    # The high bit of each byte flags that another byte follows.
    while (b & 0x80) != 0
      b = byte!
      n |= (b & 0x7F) << shift
      shift += 7
    end
    # Undo the zig-zag mapping (0, -1, 1, -2, ... were stored as 0, 1, 2, 3, ...).
    (n >> 1) ^ -(n & 1)
  end

  # A float is written as 4 bytes: the IEEE 754 single-precision bits
  # in little-endian order. The 'e' directive decodes exactly that,
  # independent of the host's byte order.
  def read_float
    @reader.read(4).unpack('e').first
  end

  # A double is written as 8 bytes: the IEEE 754 double-precision bits
  # in little-endian order. The 'E' directive decodes exactly that,
  # independent of the host's byte order.
  def read_double
    @reader.read(8).unpack('E').first
  end

  # Bytes are encoded as a long followed by that many bytes of data.
  def read_bytes
    read(read_long)
  end

  # A string is encoded as a long followed by that many bytes of
  # UTF-8 encoded character data. On Rubies with string encodings
  # (1.9+) the result is tagged as UTF-8.
  def read_string
    str = read_bytes
    str.force_encoding('UTF-8') if str.respond_to?(:force_encoding)
    str
  end

  # Reads len bytes from the underlying reader.
  def read(len)
    @reader.read(len)
  end

  def skip_null
    nil
  end

  def skip_boolean
    skip(1)
  end

  def skip_int
    skip_long
  end

  # Consumes one variable-length integer without decoding it.
  def skip_long
    b = byte!
    while (b & 0x80) != 0
      b = byte!
    end
  end

  def skip_float
    skip(4)
  end

  def skip_double
    skip(8)
  end

  # The length prefix has to be decoded to know how far to jump.
  def skip_bytes
    skip(read_long)
  end

  def skip_string
    skip_bytes
  end

  # Moves the reader n bytes forward from its current position.
  def skip(n)
    reader.seek(reader.tell + n)
  end
end
161
+
162
# Writes leaf (primitive) values in Avro's binary encoding to an
# IO-like object that responds to #write.
class BinaryEncoder
  attr_reader :writer

  def initialize(writer)
    @writer = writer
  end

  # null is written as zero bytes
  def write_null(datum)
    nil
  end

  # a boolean is written as a single byte
  # whose value is either 0 (false) or 1 (true).
  def write_boolean(datum)
    on_disk = datum ? 1.chr : 0.chr
    writer.write(on_disk)
  end

  # int and long values are written using variable-length,
  # zig-zag coding.
  def write_int(n)
    write_long(n)
  end

  # int and long values are written using variable-length,
  # zig-zag coding.
  def write_long(n)
    # Zig-zag: interleave negatives and positives so small magnitudes
    # produce short encodings.
    n = (n << 1) ^ (n >> 63)
    # Emit 7 bits at a time, low group first; the high bit of each byte
    # flags that more bytes follow.
    while (n & ~0x7F) != 0
      @writer.write(((n & 0x7f) | 0x80).chr)
      n >>= 7
    end
    @writer.write(n.chr)
  end

  # A float is written as 4 bytes: the IEEE 754 single-precision bits
  # in little-endian order. The 'e' directive produces exactly that,
  # independent of the host's byte order.
  def write_float(datum)
    @writer.write([datum].pack('e'))
  end

  # A double is written as 8 bytes: the IEEE 754 double-precision bits
  # in little-endian order. The 'E' directive produces exactly that,
  # independent of the host's byte order.
  def write_double(datum)
    @writer.write([datum].pack('E'))
  end

  # Bytes are encoded as a long followed by that many bytes of data.
  def write_bytes(datum)
    # The prefix must count bytes, not characters. String#size counts
    # characters on Ruby 1.9+, which under-reports multi-byte strings.
    length = datum.respond_to?(:bytesize) ? datum.bytesize : datum.size
    write_long(length)
    @writer.write(datum)
  end

  # A string is encoded as a long followed by that many bytes of
  # UTF-8 encoded character data
  def write_string(datum)
    # Transcode to UTF-8 where the Ruby version has string encodings.
    # Binary-tagged strings are passed through untouched, since they
    # carry no character set to convert from.
    if datum.respond_to?(:encode) && datum.encoding != Encoding::BINARY
      datum = datum.encode('UTF-8')
    end
    write_bytes(datum)
  end

  # Write an arbitrary datum, verbatim.
  def write(datum)
    writer.write(datum)
  end
end
246
+
247
# Reads data that was written with one schema (the writer's) and
# resolves it against another schema (the reader's).
class DatumReader
  # True when every property named in prop_list has the same value in
  # both schemas' hash representations.
  def self.check_props(schema_one, schema_two, prop_list)
    prop_list.all? do |prop|
      schema_one.to_hash[prop] == schema_two.to_hash[prop]
    end
  end

  # Decides whether data written with writers_schema can be read as
  # readers_schema. Returns true or false.
  def self.match_schemas(writers_schema, readers_schema)
    w_type = writers_schema.type
    r_type = readers_schema.type

    # Unions always pass here; read_data resolves them branch by branch.
    return true if w_type == 'union' || r_type == 'union'

    if w_type == r_type
      return true if Schema::PRIMITIVE_TYPES.include?(r_type)

      case r_type
      when 'record', 'error', 'enum'
        return check_props(writers_schema, readers_schema, ['fullname'])
      when 'fixed'
        return check_props(writers_schema, readers_schema, ['fullname', 'size'])
      when 'request'
        return true
      when 'map'
        return check_props(writers_schema.values, readers_schema.values, ['type'])
      when 'array'
        return check_props(writers_schema.items, readers_schema.items, ['type'])
      end
    end

    # Handle schema promotion
    case w_type
    when 'int'   then ['long', 'float', 'double'].include?(r_type)
    when 'long'  then ['float', 'double'].include?(r_type)
    when 'float' then r_type == 'double'
    else false
    end
  end

  attr_accessor :writers_schema, :readers_schema

  def initialize(writers_schema=nil, readers_schema=nil)
    @writers_schema = writers_schema
    @readers_schema = readers_schema
  end

  # Reads one datum from decoder. When no reader's schema has been set,
  # the writer's schema is used for both sides.
  def read(decoder)
    self.readers_schema = writers_schema unless readers_schema
    read_data(writers_schema, readers_schema, decoder)
  end

  # Reads one datum of writers_schema from decoder, resolved against
  # readers_schema.
  #
  # Raises SchemaMatchException when the schemas cannot be resolved and
  # AvroError when the writer's schema type is not recognised.
  def read_data(writers_schema, readers_schema, decoder)
    # schema matching
    unless self.class.match_schemas(writers_schema, readers_schema)
      raise SchemaMatchException.new(writers_schema, readers_schema)
    end

    # schema resolution: reader's schema is a union, writer's
    # schema is not
    if writers_schema.type != 'union' && readers_schema.type == 'union'
      rs = readers_schema.schemas.find { |s|
        self.class.match_schemas(writers_schema, s)
      }
      return read_data(writers_schema, rs, decoder) if rs
      raise SchemaMatchException.new(writers_schema, readers_schema)
    end

    # function dispatch for reading data based on type of writer's
    # schema
    case writers_schema.type
    when 'null';    decoder.read_null
    when 'boolean'; decoder.read_boolean
    when 'string';  decoder.read_string
    when 'int';     decoder.read_int
    when 'long';    decoder.read_long
    when 'float';   decoder.read_float
    when 'double';  decoder.read_double
    when 'bytes';   decoder.read_bytes
    when 'fixed';   read_fixed(writers_schema, readers_schema, decoder)
    when 'enum';    read_enum(writers_schema, readers_schema, decoder)
    when 'array';   read_array(writers_schema, readers_schema, decoder)
    when 'map';     read_map(writers_schema, readers_schema, decoder)
    when 'union';   read_union(writers_schema, readers_schema, decoder)
    # 'error' is the type name match_schemas accepts; it is read like a record.
    when 'record', 'error', 'request'; read_record(writers_schema, readers_schema, decoder)
    else
      raise AvroError, "Cannot read unknown schema type: #{writers_schema.type}"
    end
  end

  # A fixed is read as exactly writers_schema.size raw bytes.
  def read_fixed(writers_schema, readers_schema, decoder)
    decoder.read(writers_schema.size)
  end

  # An enum is read as an int index into the writer's symbol list.
  def read_enum(writers_schema, readers_schema, decoder)
    index_of_symbol = decoder.read_int
    # TODO(jmhodges): figure out what unset means for resolution when
    # the reader's schema does not include the symbol that was read.
    writers_schema.symbols[index_of_symbol]
  end

  # Reads a block-encoded array and returns it as a Ruby Array.
  def read_array(writers_schema, readers_schema, decoder)
    read_items = []
    each_block_item(decoder) do
      read_items << read_data(writers_schema.items,
                              readers_schema.items,
                              decoder)
    end
    read_items
  end

  # Reads a block-encoded map and returns it as a Ruby Hash with
  # string keys.
  def read_map(writers_schema, readers_schema, decoder)
    read_items = {}
    each_block_item(decoder) do
      key = decoder.read_string
      read_items[key] = read_data(writers_schema.values,
                                  readers_schema.values,
                                  decoder)
    end
    read_items
  end

  # A union is read as a long index selecting one of the writer's
  # branch schemas, followed by a datum of that branch.
  def read_union(writers_schema, readers_schema, decoder)
    index_of_schema = decoder.read_long
    selected_writers_schema = writers_schema.schemas[index_of_schema]

    read_data(selected_writers_schema, readers_schema, decoder)
  end

  # Reads a record as a Hash keyed by field name. Writer fields the
  # reader does not declare are skipped; reader fields the writer does
  # not declare are filled from their defaults when one is set.
  def read_record(writers_schema, readers_schema, decoder)
    readers_fields_hash = readers_schema.fields_hash
    record = {}
    writers_schema.fields.each do |field|
      readers_field = readers_fields_hash[field.name]
      if readers_field
        record[field.name] = read_data(field.type, readers_field.type, decoder)
      else
        skip_data(field.type, decoder)
      end
    end

    # fill in the default values
    if readers_fields_hash.size > record.size
      writers_fields_hash = writers_schema.fields_hash
      readers_fields_hash.each do |field_name, field|
        next if writers_fields_hash.has_key?(field_name)
        # FIXME(jmhodges) another 'unset' here: fields without a default
        # are left out of the result.
        next if field.default.nil?
        record[field.name] = read_default_value(field.type, field.default)
      end
    end

    record
  end

  # Converts a default value, as it appears in the parsed schema JSON,
  # into the Ruby value a decoded datum of field_schema would have.
  #
  # Raises AvroError when field_schema's type is not recognised.
  def read_default_value(field_schema, default_value)
    # Basically a JSON Decoder?
    case field_schema.type
    when 'null'
      nil
    when 'boolean'
      default_value
    when 'int', 'long'
      Integer(default_value)
    when 'float', 'double'
      Float(default_value)
    when 'enum', 'fixed', 'string', 'bytes'
      default_value
    when 'array'
      default_value.map do |json_val|
        read_default_value(field_schema.items, json_val)
      end
    when 'map'
      defaults = {}
      default_value.each do |key, json_val|
        defaults[key] = read_default_value(field_schema.values, json_val)
      end
      defaults
    when 'union'
      # The default is interpreted against the first branch of the union.
      read_default_value(field_schema.schemas[0], default_value)
    when 'record'
      defaults = {}
      field_schema.fields.each do |field|
        json_val = default_value[field.name]
        # Fall back to the field's own default only when no value was
        # given; an explicit false must be kept.
        json_val = field.default if json_val.nil?
        defaults[field.name] = read_default_value(field.type, json_val)
      end
      defaults
    else
      raise AvroError, "Unknown type: #{field_schema.type}"
    end
  end

  # Consumes one datum of writers_schema from decoder without building
  # a value for it. Used for writer fields the reader does not declare.
  #
  # Raises AvroError when the schema type is not recognised.
  def skip_data(writers_schema, decoder)
    case writers_schema.type
    when 'null';    decoder.skip_null
    when 'boolean'; decoder.skip_boolean
    when 'string';  decoder.skip_string
    when 'int';     decoder.skip_int
    when 'long';    decoder.skip_long
    when 'float';   decoder.skip_float
    when 'double';  decoder.skip_double
    when 'bytes';   decoder.skip_bytes
    when 'fixed';   decoder.skip(writers_schema.size)
    when 'enum';    decoder.skip_int
    when 'array'
      skip_blocks(decoder) { skip_data(writers_schema.items, decoder) }
    when 'map'
      skip_blocks(decoder) do
        decoder.skip_string
        skip_data(writers_schema.values, decoder)
      end
    when 'union'
      index_of_schema = decoder.read_long
      skip_data(writers_schema.schemas[index_of_schema], decoder)
    when 'record', 'error', 'request'
      writers_schema.fields.each { |field| skip_data(field.type, decoder) }
    else
      raise AvroError, "Cannot skip unknown schema type: #{writers_schema.type}"
    end
  end

  private

  # Walks the block structure shared by arrays and maps, yielding once
  # per item. Iteration ends at a block count of zero.
  def each_block_item(decoder)
    block_count = decoder.read_long
    while block_count != 0
      if block_count < 0
        # A negative count is followed by the block's size in bytes,
        # which is only useful for skipping; read it and move on.
        block_count = -block_count
        decoder.read_long
      end
      block_count.times { yield }
      block_count = decoder.read_long
    end
  end

  # Skips the blocks of an array or map. Blocks that carry a byte size
  # (negative count) are jumped over in one step; for the others the
  # given block is called once per item to skip it.
  def skip_blocks(decoder)
    block_count = decoder.read_long
    while block_count != 0
      if block_count < 0
        decoder.skip(decoder.read_long)
      else
        block_count.times { yield }
      end
      block_count = decoder.read_long
    end
  end
end # DatumReader
486
+
487
# DatumWriter for generic ruby objects
class DatumWriter
  attr_accessor :writers_schema

  def initialize(writers_schema=nil)
    @writers_schema = writers_schema
  end

  # Writes datum to encoder using the configured writer's schema.
  def write(datum, encoder)
    write_data(writers_schema, datum, encoder)
  end

  # Validates datum against writers_schema and writes it to encoder.
  #
  # Raises AvroTypeError when Schema.validate rejects the datum and
  # AvroError when the schema's type is not recognised.
  def write_data(writers_schema, datum, encoder)
    unless Schema.validate(writers_schema, datum)
      raise AvroTypeError.new(writers_schema, datum)
    end

    # function dispatch to write datum
    case writers_schema.type
    when 'null';    encoder.write_null(datum)
    when 'boolean'; encoder.write_boolean(datum)
    when 'string';  encoder.write_string(datum)
    when 'int';     encoder.write_int(datum)
    when 'long';    encoder.write_long(datum)
    when 'float';   encoder.write_float(datum)
    when 'double';  encoder.write_double(datum)
    when 'bytes';   encoder.write_bytes(datum)
    when 'fixed';   write_fixed(writers_schema, datum, encoder)
    when 'enum';    write_enum(writers_schema, datum, encoder)
    when 'array';   write_array(writers_schema, datum, encoder)
    when 'map';     write_map(writers_schema, datum, encoder)
    when 'union';   write_union(writers_schema, datum, encoder)
    # 'error' is the type name DatumReader.match_schemas uses; 'errors'
    # is still accepted so existing callers keep working.
    when 'record', 'error', 'errors', 'request'; write_record(writers_schema, datum, encoder)
    else
      raise AvroError, "Unknown type: #{writers_schema.type}"
    end
  end

  # A fixed is written as its raw bytes, with no length prefix.
  def write_fixed(writers_schema, datum, encoder)
    encoder.write(datum)
  end

  # An enum is written as the int index of the symbol in the schema's
  # symbol list.
  def write_enum(writers_schema, datum, encoder)
    index_of_datum = writers_schema.symbols.index(datum)
    encoder.write_int(index_of_datum)
  end

  # An array is written as one block (item count, then the items)
  # followed by a zero count; an empty array is just the zero.
  def write_array(writers_schema, datum, encoder)
    if datum.size > 0
      encoder.write_long(datum.size)
      datum.each do |item|
        write_data(writers_schema.items, item, encoder)
      end
    end
    encoder.write_long(0)
  end

  # A map is written as one block (pair count, then each string key
  # followed by its value) followed by a zero count.
  def write_map(writers_schema, datum, encoder)
    if datum.size > 0
      encoder.write_long(datum.size)
      datum.each do |k, v|
        encoder.write_string(k)
        write_data(writers_schema.values, v, encoder)
      end
    end
    encoder.write_long(0)
  end

  # A union is written as the long index of the first branch schema
  # the datum validates against, followed by the datum written with
  # that branch.
  #
  # Raises AvroTypeError when no branch accepts the datum.
  def write_union(writers_schema, datum, encoder)
    # each_with_index is used because find_index doesn't exist in 1.8.6
    index_of_schema = nil
    writers_schema.schemas.each_with_index do |candidate, i|
      if Schema.validate(candidate, datum)
        index_of_schema = i
        break
      end
    end
    raise AvroTypeError.new(writers_schema, datum) unless index_of_schema

    encoder.write_long(index_of_schema)
    write_data(writers_schema.schemas[index_of_schema], datum, encoder)
  end

  # A record is written as its fields' values in schema order, looked
  # up in datum by field name.
  def write_record(writers_schema, datum, encoder)
    writers_schema.fields.each do |field|
      write_data(field.type, datum[field.name], encoder)
    end
  end
end # DatumWriter
571
+ end
572
+ end