avro 1.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,572 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ module Avro
18
+ module IO
19
# Raised when a datum does not conform to the schema it was expected
# to be an example of.
class AvroTypeError < AvroError
  # expected_schema is the schema the datum was checked against and
  # datum is the offending value; both are interpolated into the
  # exception message (the datum via #inspect).
  def initialize(expected_schema, datum)
    message = "The datum #{datum.inspect} is not an example of schema #{expected_schema}"
    super(message)
  end
end
25
+
26
# Raised when the writer's schema and the reader's schema cannot be
# matched against each other.
class SchemaMatchException < AvroError
  # Both schemas are interpolated into the exception message.
  def initialize(writers_schema, readers_schema)
    message = "Writer's schema #{writers_schema} and Reader's schema #{readers_schema} do not match."
    super(message)
  end
end
33
+
34
+ # FIXME(jmhodges) move validate to this module?
35
+
36
class BinaryDecoder
  # Read leaf values: decodes Avro's binary encoding of primitive
  # values from an underlying stream.

  # reader is an object on which we can call read, seek and tell.
  attr_reader :reader

  def initialize(reader)
    @reader = reader
  end

  # Reads one byte and returns it as an Integer in 0..255.
  #
  # The byte is unpacked explicitly because String#[] with an integer
  # index returns a one-character String (not an Integer) on Ruby 1.9
  # and later, which would break the bit arithmetic in the callers.
  def byte!
    @reader.read(1).unpack('C').first
  end

  def read_null
    # null is written as zero bytes
    nil
  end

  # A boolean is a single byte; only the value 1 decodes as true.
  def read_boolean
    byte! == 1
  end

  def read_int; read_long; end

  def read_long
    # int and long values are written using variable-length,
    # zig-zag coding.
    b = byte!
    n = b & 0x7F
    shift = 7
    # The high bit of each byte signals that another byte follows.
    while (b & 0x80) != 0
      b = byte!
      n |= (b & 0x7F) << shift
      shift += 7
    end
    # Undo the zig-zag mapping (even -> non-negative, odd -> negative).
    (n >> 1) ^ -(n & 1)
  end

  def read_float
    # A float is written as 4 bytes.
    # The float is converted into a 32-bit integer using a method
    # equivalent to Java's floatToIntBits and then encoded in
    # little-endian format.
    #
    # 'e' decodes a little-endian single-precision float directly, so
    # the result does not depend on the host's byte order.
    @reader.read(4).unpack('e')[0]
  end

  def read_double
    # A double is written as 8 bytes.
    # The double is converted into a 64-bit integer using a method
    # equivalent to Java's doubleToLongBits and then encoded in
    # little-endian format.
    #
    # 'E' decodes a little-endian double-precision float directly.
    @reader.read(8).unpack('E')[0]
  end

  def read_bytes
    # Bytes are encoded as a long followed by that many bytes of
    # data.
    read(read_long)
  end

  def read_string
    # A string is encoded as a long followed by that many bytes of
    # UTF-8 encoded character data.
    str = read_bytes
    # Tag the raw bytes as UTF-8 where the runtime has string
    # encodings (Ruby 1.9+); the bytes themselves are not changed.
    str.force_encoding('UTF-8') if str.respond_to?(:force_encoding)
    str
  end

  def read(len)
    # Read len bytes from the underlying reader
    @reader.read(len)
  end

  def skip_null
    nil
  end

  def skip_boolean
    skip(1)
  end

  def skip_int
    skip_long
  end

  # Consumes one variable-length integer without decoding it.
  def skip_long
    b = byte!
    while (b & 0x80) != 0
      b = byte!
    end
  end

  def skip_float
    skip(4)
  end

  def skip_double
    skip(8)
  end

  # Reads the length prefix, then skips that many bytes.
  def skip_bytes
    skip(read_long)
  end

  def skip_string
    skip_bytes
  end

  # Advances the reader by n bytes relative to its current position.
  def skip(n)
    reader.seek(reader.tell + n)
  end
end
161
+
162
# Write leaf values: encodes primitive values in Avro's binary
# encoding onto an underlying stream.
class BinaryEncoder
  # writer is an object on which we can call write.
  attr_reader :writer

  def initialize(writer)
    @writer = writer
  end

  # null is written as zero bytes
  def write_null(datum)
    nil
  end

  # a boolean is written as a single byte
  # whose value is either 0 (false) or 1 (true).
  def write_boolean(datum)
    on_disk = datum ? 1.chr : 0.chr
    writer.write(on_disk)
  end

  # int and long values are written using variable-length,
  # zig-zag coding.
  def write_int(n)
    write_long(n)
  end

  # int and long values are written using variable-length,
  # zig-zag coding.
  def write_long(n)
    # Zig-zag: fold the sign into the lowest bit.
    n = (n << 1) ^ (n >> 63)
    # Emit 7 bits at a time, low bits first; the high bit of each
    # byte marks that more bytes follow.
    while (n & ~0x7F) != 0
      @writer.write(((n & 0x7f) | 0x80).chr)
      n >>= 7
    end
    @writer.write(n.chr)
  end

  # A float is written as 4 bytes.
  # The float is converted into a 32-bit integer using a method
  # equivalent to Java's floatToIntBits and then encoded in
  # little-endian format.
  def write_float(datum)
    # 'e' emits a little-endian single-precision float directly, so
    # the output does not depend on the host's byte order.
    @writer.write([datum].pack('e'))
  end

  # A double is written as 8 bytes.
  # The double is converted into a 64-bit integer using a method
  # equivalent to Java's doubleToLongBits and then encoded in
  # little-endian format.
  def write_double(datum)
    # 'E' emits a little-endian double-precision float directly.
    @writer.write([datum].pack('E'))
  end

  # Bytes are encoded as a long followed by that many bytes of data.
  def write_bytes(datum)
    data = datum.to_s
    # The prefix must be the byte count; String#size counts
    # characters on Ruby 1.9+, which is wrong for multibyte data.
    length = data.respond_to?(:bytesize) ? data.bytesize : data.size
    write_long(length)
    @writer.write(data)
  end

  # A string is encoded as a long followed by that many bytes of
  # UTF-8 encoded character data
  def write_string(datum)
    write_bytes(utf8_bytes(datum))
  end

  # Write an arbitrary datum.
  def write(datum)
    writer.write(datum)
  end

  private

  # Returns datum transcoded to UTF-8 when it is a string in some
  # other character encoding. Strings already tagged UTF-8, US-ASCII
  # or binary, and objects without encoding support, pass through
  # unchanged.
  def utf8_bytes(datum)
    return datum unless datum.respond_to?(:encode) && datum.respond_to?(:encoding)

    passthrough = [Encoding::UTF_8, Encoding::US_ASCII, Encoding::ASCII_8BIT]
    return datum if passthrough.include?(datum.encoding)

    datum.encode(Encoding::UTF_8)
  end
end
246
+
247
# Reads data written with a writer's schema and resolves it against a
# reader's schema, pulling leaf values from a decoder.
class DatumReader
  # True when both schemas have equal values for every property named
  # in prop_list (looked up through each schema's to_hash).
  def self.check_props(schema_one, schema_two, prop_list)
    prop_list.all? do |prop|
      schema_one.to_hash[prop] == schema_two.to_hash[prop]
    end
  end

  # Decides whether data written with writers_schema can be read with
  # readers_schema. Returns true or false.
  def self.match_schemas(writers_schema, readers_schema)
    w_type = writers_schema.type
    r_type = readers_schema.type

    # Unions are resolved later, branch by branch, in read_data.
    return true if [w_type, r_type].include?('union')

    if w_type == r_type
      return true if Schema::PRIMITIVE_TYPES.include?(r_type)

      case r_type
      when 'record', 'error', 'enum'
        return true if check_props(writers_schema, readers_schema, ['fullname'])
      when 'request'
        return true
      when 'fixed'
        return true if check_props(writers_schema, readers_schema, ['fullname', 'size'])
      when 'map'
        return true if check_props(writers_schema.values, readers_schema.values, ['type'])
      when 'array'
        return true if check_props(writers_schema.items, readers_schema.items, ['type'])
      end
    end

    # Handle schema promotion
    if w_type == 'int' && ['long', 'float', 'double'].include?(r_type)
      return true
    elsif w_type == 'long' && ['float', 'double'].include?(r_type)
      return true
    elsif w_type == 'float' && r_type == 'double'
      return true
    end

    false
  end

  attr_accessor :writers_schema, :readers_schema

  def initialize(writers_schema=nil, readers_schema=nil)
    @writers_schema = writers_schema
    @readers_schema = readers_schema
  end

  # Reads one datum from decoder. When no reader's schema has been
  # set, the writer's schema is used (and stored) as the reader's.
  def read(decoder)
    self.readers_schema = writers_schema unless readers_schema
    read_data(writers_schema, readers_schema, decoder)
  end

  # Reads one datum of writers_schema from decoder, resolved against
  # readers_schema. Raises SchemaMatchException when the schemas do
  # not match and AvroError for an unknown writer's schema type.
  def read_data(writers_schema, readers_schema, decoder)
    # schema matching
    unless self.class.match_schemas(writers_schema, readers_schema)
      raise SchemaMatchException.new(writers_schema, readers_schema)
    end

    # schema resolution: reader's schema is a union, writer's
    # schema is not
    if writers_schema.type != 'union' && readers_schema.type == 'union'
      rs = readers_schema.schemas.find { |s|
        self.class.match_schemas(writers_schema, s)
      }
      return read_data(writers_schema, rs, decoder) if rs
      raise SchemaMatchException.new(writers_schema, readers_schema)
    end

    # function dispatch for reading data based on type of writer's
    # schema
    case writers_schema.type
    when 'null';    decoder.read_null
    when 'boolean'; decoder.read_boolean
    when 'string';  decoder.read_string
    when 'int';     decoder.read_int
    when 'long';    decoder.read_long
    when 'float';   decoder.read_float
    when 'double';  decoder.read_double
    when 'bytes';   decoder.read_bytes
    when 'fixed';   read_fixed(writers_schema, readers_schema, decoder)
    when 'enum';    read_enum(writers_schema, readers_schema, decoder)
    when 'array';   read_array(writers_schema, readers_schema, decoder)
    when 'map';     read_map(writers_schema, readers_schema, decoder)
    when 'union';   read_union(writers_schema, readers_schema, decoder)
    # 'error' is the type name match_schemas checks for; 'errors' is
    # kept only so callers relying on the old spelling still work.
    when 'record', 'error', 'errors', 'request'
      read_record(writers_schema, readers_schema, decoder)
    else
      raise AvroError, "Cannot read unknown schema type: #{writers_schema.type}"
    end
  end

  # A fixed is read as exactly writers_schema.size raw bytes.
  def read_fixed(writers_schema, readers_schema, decoder)
    decoder.read(writers_schema.size)
  end

  # An enum is read as an int index into the writer's symbol list.
  def read_enum(writers_schema, readers_schema, decoder)
    index_of_symbol = decoder.read_int
    read_symbol = writers_schema.symbols[index_of_symbol]

    # TODO(jmhodges): figure out what unset means for resolution
    # schema resolution
    unless readers_schema.symbols.include?(read_symbol)
      # 'unset' here
    end

    read_symbol
  end

  # Reads a series of blocks, each a count followed by that many
  # items, until a block count of zero. Returns an Array.
  def read_array(writers_schema, readers_schema, decoder)
    read_items = []
    block_count = decoder.read_long
    while block_count != 0
      if block_count < 0
        block_count = -block_count
        # A negative count is followed by another long (the block
        # size), which is consumed here and otherwise ignored.
        decoder.read_long
      end
      block_count.times do
        read_items << read_data(writers_schema.items,
                                readers_schema.items,
                                decoder)
      end
      block_count = decoder.read_long
    end

    read_items
  end

  # Reads a series of blocks of string-key/value pairs until a block
  # count of zero. Returns a Hash.
  def read_map(writers_schema, readers_schema, decoder)
    read_items = {}
    block_count = decoder.read_long
    while block_count != 0
      if block_count < 0
        block_count = -block_count
        # A negative count is followed by another long (the block
        # size), which is consumed here and otherwise ignored.
        decoder.read_long
      end
      block_count.times do
        key = decoder.read_string
        read_items[key] = read_data(writers_schema.values,
                                    readers_schema.values,
                                    decoder)
      end
      block_count = decoder.read_long
    end

    read_items
  end

  # A union is read as a long index selecting one of the writer's
  # branch schemas, followed by a datum of that branch.
  def read_union(writers_schema, readers_schema, decoder)
    index_of_schema = decoder.read_long
    selected_writers_schema = writers_schema.schemas[index_of_schema]

    read_data(selected_writers_schema, readers_schema, decoder)
  end

  # Reads the writer's fields in order. Fields the reader also has are
  # kept, the rest are skipped; reader-only fields are then filled in
  # from their defaults. Returns a Hash keyed by field name.
  def read_record(writers_schema, readers_schema, decoder)
    readers_fields_hash = readers_schema.fields_hash
    record = {}
    writers_schema.fields.each do |field|
      readers_field = readers_fields_hash[field.name]
      if readers_field
        record[field.name] = read_data(field.type, readers_field.type, decoder)
      else
        skip_data(field.type, decoder)
      end
    end

    # fill in the default values
    if readers_fields_hash.size > record.size
      writers_fields_hash = writers_schema.fields_hash
      readers_fields_hash.each do |field_name, field|
        next if writers_fields_hash.has_key?(field_name)

        if !field.default.nil?
          record[field.name] = read_default_value(field.type, field.default)
        else
          # FIXME(jmhodges) another 'unset' here
        end
      end
    end

    record
  end

  # Converts a default value (as parsed from the schema's JSON) into
  # the in-memory form a datum of field_schema would have. Raises
  # AvroError for an unknown schema type.
  def read_default_value(field_schema, default_value)
    # Basically a JSON Decoder?
    case field_schema.type
    when 'null'
      nil
    when 'boolean'
      default_value
    when 'int', 'long'
      Integer(default_value)
    when 'float', 'double'
      Float(default_value)
    when 'enum', 'fixed', 'string', 'bytes'
      default_value
    when 'array'
      default_value.map do |json_val|
        read_default_value(field_schema.items, json_val)
      end
    when 'map'
      map = {}
      default_value.each do |key, json_val|
        map[key] = read_default_value(field_schema.values, json_val)
      end
      map
    when 'union'
      # The default is interpreted against the first branch.
      read_default_value(field_schema.schemas[0], default_value)
    when 'record'
      record = {}
      field_schema.fields.each do |field|
        json_val = default_value[field.name]
        # Only a missing value falls back to the field's own default;
        # an explicit false must be kept.
        json_val = field.default if json_val.nil?
        record[field.name] = read_default_value(field.type, json_val)
      end
      record
    else
      raise AvroError, "Unknown type: #{field_schema.type}"
    end
  end
end # DatumReader
486
+
487
# DatumWriter for generic ruby objects: validates a datum against the
# writer's schema and emits it through an encoder.
class DatumWriter
  attr_accessor :writers_schema

  def initialize(writers_schema=nil)
    @writers_schema = writers_schema
  end

  # Writes datum to encoder using the configured writer's schema.
  def write(datum, encoder)
    write_data(writers_schema, datum, encoder)
  end

  # Writes datum as an instance of writers_schema. Raises
  # AvroTypeError when Schema.validate rejects the datum and AvroError
  # for an unknown schema type.
  def write_data(writers_schema, datum, encoder)
    unless Schema.validate(writers_schema, datum)
      raise AvroTypeError.new(writers_schema, datum)
    end

    # function dispatch to write datum
    case writers_schema.type
    when 'null';    encoder.write_null(datum)
    when 'boolean'; encoder.write_boolean(datum)
    when 'string';  encoder.write_string(datum)
    when 'int';     encoder.write_int(datum)
    when 'long';    encoder.write_long(datum)
    when 'float';   encoder.write_float(datum)
    when 'double';  encoder.write_double(datum)
    when 'bytes';   encoder.write_bytes(datum)
    when 'fixed';   write_fixed(writers_schema, datum, encoder)
    when 'enum';    write_enum(writers_schema, datum, encoder)
    when 'array';   write_array(writers_schema, datum, encoder)
    when 'map';     write_map(writers_schema, datum, encoder)
    when 'union';   write_union(writers_schema, datum, encoder)
    # 'error' is accepted alongside the previously used spelling
    # 'errors', which is kept so existing callers still work.
    when 'record', 'error', 'errors', 'request'
      write_record(writers_schema, datum, encoder)
    else
      raise AvroError.new("Unknown type: #{writers_schema.type}")
    end
  end

  # A fixed is written as its raw bytes, with no length prefix.
  def write_fixed(writers_schema, datum, encoder)
    encoder.write(datum)
  end

  # An enum is written as the int index of datum in the schema's
  # symbol list. Raises AvroTypeError when datum is not a symbol of
  # the schema.
  def write_enum(writers_schema, datum, encoder)
    index_of_datum = writers_schema.symbols.index(datum)
    raise AvroTypeError.new(writers_schema, datum) if index_of_datum.nil?

    encoder.write_int(index_of_datum)
  end

  # A non-empty array is written as a single block (item count, then
  # the items); a count of zero terminates the array.
  def write_array(writers_schema, datum, encoder)
    if datum.size > 0
      encoder.write_long(datum.size)
      datum.each do |item|
        write_data(writers_schema.items, item, encoder)
      end
    end
    encoder.write_long(0)
  end

  # A non-empty map is written as a single block (pair count, then
  # string key / value pairs); a count of zero terminates the map.
  def write_map(writers_schema, datum, encoder)
    if datum.size > 0
      encoder.write_long(datum.size)
      datum.each do |k, v|
        encoder.write_string(k)
        write_data(writers_schema.values, v, encoder)
      end
    end
    encoder.write_long(0)
  end

  # A union is written as the long index of the first branch schema
  # that validates datum, followed by datum written with that branch.
  # Raises AvroTypeError when no branch validates.
  def write_union(writers_schema, datum, encoder)
    index_of_schema = nil
    writers_schema.schemas.each_with_index do |candidate, i|
      if Schema.validate(candidate, datum)
        index_of_schema = i
        break
      end
    end
    raise AvroTypeError.new(writers_schema, datum) if index_of_schema.nil?

    encoder.write_long(index_of_schema)
    write_data(writers_schema.schemas[index_of_schema], datum, encoder)
  end

  # A record is written as its fields' values in schema order, each
  # looked up in datum by field name.
  def write_record(writers_schema, datum, encoder)
    writers_schema.fields.each do |field|
      write_data(field.type, datum[field.name], encoder)
    end
  end
end # DatumWriter
571
+ end
572
+ end