avro-jruby 1.7.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,615 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ module Avro
18
+ module IO
19
+ # Raised when datum is not an example of schema
20
class AvroTypeError < AvroError
  # expected_schema:: the schema the datum was checked against
  # datum::           the value that failed the check (shown via #inspect)
  def initialize(expected_schema, datum)
    message = "The datum #{datum.inspect} is not an example of schema #{expected_schema}"
    super(message)
  end
end
25
+
26
+ # Raised when writer's and reader's schema do not match
27
class SchemaMatchException < AvroError
  # Builds the message from the string forms of both schemas.
  def initialize(writers_schema, readers_schema)
    message = "Writer's schema #{writers_schema} and Reader's schema #{readers_schema} do not match."
    super(message)
  end
end
33
+
34
+ # FIXME(jmhodges) move validate to this module?
35
+
36
class BinaryDecoder
  # Read leaf values

  # reader is an object on which we can call read, seek and tell.
  attr_reader :reader

  def initialize(reader)
    @reader = reader
  end

  # Reads one byte from the reader and returns it as an Integer (0..255).
  #
  # Raises EOFError when the reader has nothing left to give, instead of
  # failing later with a NoMethodError on nil.
  def byte!
    chunk = @reader.read(1)
    raise EOFError, 'unexpected end of Avro data' if chunk.nil? || chunk.empty?
    chunk.unpack('C').first
  end

  def read_null
    # null is written as zero bytes
    nil
  end

  # A boolean is a single byte; only the value 1 decodes to true.
  def read_boolean
    byte! == 1
  end

  def read_int; read_long; end

  def read_long
    # int and long values are written using variable-length,
    # zig-zag coding.
    b = byte!
    n = b & 0x7F
    shift = 7
    # The high bit of each byte says whether another byte follows.
    while (b & 0x80) != 0
      b = byte!
      n |= (b & 0x7F) << shift
      shift += 7
    end
    # Undo the zig-zag mapping.
    (n >> 1) ^ -(n & 1)
  end

  def read_float
    # A float is written as 4 bytes.
    # The float is converted into a 32-bit integer using a method
    # equivalent to Java's floatToIntBits and then encoded in
    # little-endian format.
    @reader.read(4).unpack('e')[0]
  end

  def read_double
    # A double is written as 8 bytes.
    # The double is converted into a 64-bit integer using a method
    # equivalent to Java's doubleToLongBits and then encoded in
    # little-endian format.
    @reader.read(8).unpack('E')[0]
  end

  def read_bytes
    # Bytes are encoded as a long followed by that many bytes of
    # data.
    read(read_long)
  end

  def read_string
    # A string is encoded as a long followed by that many bytes of
    # UTF-8 encoded character data.
    read_bytes.tap do |string|
      string.force_encoding("UTF-8") if string.respond_to? :force_encoding
    end
  end

  def read(len)
    # Read len bytes
    @reader.read(len)
  end

  def skip_null
    nil
  end

  def skip_boolean
    skip(1)
  end

  def skip_int
    skip_long
  end

  # Consumes one variable-length int/long without decoding its value.
  def skip_long
    b = byte!
    while (b & 0x80) != 0
      b = byte!
    end
  end

  def skip_float
    skip(4)
  end

  def skip_double
    skip(8)
  end

  # Reads the length prefix, then seeks past that many bytes.
  def skip_bytes
    skip(read_long)
  end

  def skip_string
    skip_bytes
  end

  # Moves the reader n bytes forward from its current position.
  def skip(n)
    reader.seek(reader.tell() + n)
  end
end
148
+
149
+ # Write leaf values
150
class BinaryEncoder
  # writer is the object that receives the encoded bytes via #write.
  attr_reader :writer

  def initialize(writer)
    @writer = writer
  end

  # null is written as zero bytes
  def write_null(datum)
    nil
  end

  # a boolean is written as a single byte
  # whose value is either 0 (false) or 1 (true).
  def write_boolean(datum)
    on_disk = datum ? 1.chr : 0.chr
    writer.write(on_disk)
  end

  # int and long values are written using variable-length,
  # zig-zag coding.
  def write_int(n)
    write_long(n)
  end

  # int and long values are written using variable-length,
  # zig-zag coding.
  def write_long(n)
    # Zig-zag: fold the sign into the lowest bit.
    n = (n << 1) ^ (n >> 63)
    # Emit 7 bits per byte, low bits first; the high bit marks
    # "more bytes follow".
    while (n & ~0x7F) != 0
      @writer.write(((n & 0x7f) | 0x80).chr)
      n >>= 7
    end
    @writer.write(n.chr)
  end

  # A float is written as 4 bytes.
  # The float is converted into a 32-bit integer using a method
  # equivalent to Java's floatToIntBits and then encoded in
  # little-endian format.
  def write_float(datum)
    @writer.write([datum].pack('e'))
  end

  # A double is written as 8 bytes.
  # The double is converted into a 64-bit integer using a method
  # equivalent to Java's doubleToLongBits and then encoded in
  # little-endian format.
  def write_double(datum)
    @writer.write([datum].pack('E'))
  end

  # Bytes are encoded as a long followed by that many bytes of data.
  def write_bytes(datum)
    write_long(datum.bytesize)
    @writer.write(datum)
  end

  # A string is encoded as a long followed by that many bytes of
  # UTF-8 encoded character data.
  #
  # Strings tagged with another text encoding (e.g. ISO-8859-1) are
  # transcoded to UTF-8 first. UTF-8, US-ASCII and binary strings are
  # written byte-for-byte, as before.
  def write_string(datum)
    if datum.respond_to?(:encoding)
      passthrough = [::Encoding::UTF_8, ::Encoding::US_ASCII, ::Encoding::BINARY]
      datum = datum.encode(::Encoding::UTF_8) unless passthrough.include?(datum.encoding)
    end
    write_bytes(datum)
  end

  # Write an arbitrary datum.
  def write(datum)
    writer.write(datum)
  end
end
221
+
222
class DatumReader
  # Returns true when data written with writers_schema may be read using
  # readers_schema, false otherwise.
  #
  # Unions on either side are accepted here and resolved while reading.
  # Named types are compared by fullname (fixed also by size). Arrays and
  # maps match when their item/value schemas match, so numeric promotion
  # (e.g. array<int> read as array<long>) works for containers too.
  def self.match_schemas(writers_schema, readers_schema)
    w_type = writers_schema.type_sym
    r_type = readers_schema.type_sym

    # This conditional is begging for some OO love.
    return true if w_type == :union || r_type == :union

    if w_type == r_type
      return true if Schema::PRIMITIVE_TYPES_SYM.include?(r_type)

      case r_type
      when :record, :error, :enum
        return writers_schema.fullname == readers_schema.fullname
      when :request
        return true
      when :fixed
        return writers_schema.fullname == readers_schema.fullname &&
               writers_schema.size == readers_schema.size
      when :map
        return match_schemas(writers_schema.values, readers_schema.values)
      when :array
        return match_schemas(writers_schema.items, readers_schema.items)
      end
    end

    # Handle schema promotion
    if w_type == :int && [:long, :float, :double].include?(r_type)
      return true
    elsif w_type == :long && [:float, :double].include?(r_type)
      return true
    elsif w_type == :float && r_type == :double
      return true
    end

    false
  end

  attr_accessor :writers_schema, :readers_schema

  def initialize(writers_schema=nil, readers_schema=nil)
    @writers_schema = writers_schema
    @readers_schema = readers_schema
  end

  # Reads one datum from decoder. When no reader's schema was set, the
  # writer's schema is used for both sides.
  def read(decoder)
    self.readers_schema = writers_schema unless readers_schema
    read_data(writers_schema, readers_schema, decoder)
  end

  # Decodes one value of writers_schema, resolved against readers_schema.
  #
  # Raises SchemaMatchException when the schemas do not match and
  # AvroError when the writer's schema type is unknown.
  def read_data(writers_schema, readers_schema, decoder)
    # schema matching
    unless self.class.match_schemas(writers_schema, readers_schema)
      raise SchemaMatchException.new(writers_schema, readers_schema)
    end

    # schema resolution: reader's schema is a union, writer's
    # schema is not
    if writers_schema.type_sym != :union && readers_schema.type_sym == :union
      rs = readers_schema.schemas.find { |s|
        self.class.match_schemas(writers_schema, s)
      }
      return read_data(writers_schema, rs, decoder) if rs
      raise SchemaMatchException.new(writers_schema, readers_schema)
    end

    # function dispatch for reading data based on type of writer's
    # schema
    case writers_schema.type_sym
    when :null;    decoder.read_null
    when :boolean; decoder.read_boolean
    when :string;  decoder.read_string
    when :int;     decoder.read_int
    when :long;    decoder.read_long
    when :float;   decoder.read_float
    when :double;  decoder.read_double
    when :bytes;   decoder.read_bytes
    when :fixed;   read_fixed(writers_schema, readers_schema, decoder)
    when :enum;    read_enum(writers_schema, readers_schema, decoder)
    when :array;   read_array(writers_schema, readers_schema, decoder)
    when :map;     read_map(writers_schema, readers_schema, decoder)
    when :union;   read_union(writers_schema, readers_schema, decoder)
    when :record, :error, :request; read_record(writers_schema, readers_schema, decoder)
    else
      raise AvroError, "Cannot read unknown schema type: #{writers_schema.type}"
    end
  end

  # A fixed is exactly writers_schema.size raw bytes.
  def read_fixed(writers_schema, readers_schema, decoder)
    decoder.read(writers_schema.size)
  end

  # An enum is written as the int index of its symbol in the writer's
  # schema. The symbol is returned as-is.
  def read_enum(writers_schema, readers_schema, decoder)
    index_of_symbol = decoder.read_int
    # TODO(jmhodges): figure out what 'unset' means for resolution when
    # the reader's schema does not include the symbol that was read.
    writers_schema.symbols[index_of_symbol]
  end

  # Arrays are written as a series of blocks, each prefixed by an item
  # count, and terminated by a zero count.
  def read_array(writers_schema, readers_schema, decoder)
    read_items = []
    block_count = decoder.read_long
    while block_count != 0
      if block_count < 0
        block_count = -block_count
        # A negative count is followed by a long (the block size); it
        # must be consumed but is not needed to decode the items.
        decoder.read_long
      end
      block_count.times do
        read_items << read_data(writers_schema.items,
                                readers_schema.items,
                                decoder)
      end
      block_count = decoder.read_long
    end

    read_items
  end

  # Maps use the same block framing as arrays; each entry is a string
  # key followed by a value.
  def read_map(writers_schema, readers_schema, decoder)
    read_items = {}
    block_count = decoder.read_long
    while block_count != 0
      if block_count < 0
        block_count = -block_count
        # Block size: consumed but not needed (see read_array).
        decoder.read_long
      end
      block_count.times do
        key = decoder.read_string
        read_items[key] = read_data(writers_schema.values,
                                    readers_schema.values,
                                    decoder)
      end
      block_count = decoder.read_long
    end

    read_items
  end

  # A union is written as the long index of the branch that was used,
  # followed by the value encoded with that branch's schema.
  def read_union(writers_schema, readers_schema, decoder)
    index_of_schema = decoder.read_long
    selected_writers_schema = writers_schema.schemas[index_of_schema]

    read_data(selected_writers_schema, readers_schema, decoder)
  end

  # Reads the writer's fields in order. Fields the reader does not know
  # are skipped; reader-only fields with a non-nil default are filled in
  # from that default.
  def read_record(writers_schema, readers_schema, decoder)
    readers_fields_hash = readers_schema.fields_hash
    record = {}
    writers_schema.fields.each do |field|
      if readers_field = readers_fields_hash[field.name]
        record[field.name] = read_data(field.type, readers_field.type, decoder)
      else
        skip_data(field.type, decoder)
      end
    end

    # fill in the default values
    if readers_fields_hash.size > record.size
      writers_fields_hash = writers_schema.fields_hash
      readers_fields_hash.each do |field_name, field|
        next if writers_fields_hash.has_key?(field_name)
        if !field.default.nil?
          record[field.name] = read_default_value(field.type, field.default)
        else
          # FIXME(jmhodges) another 'unset' here
        end
      end
    end

    record
  end

  # Converts a JSON-style default value into the Ruby value for
  # field_schema. Raises AvroError for an unknown schema type.
  def read_default_value(field_schema, default_value)
    # Basically a JSON Decoder?
    case field_schema.type_sym
    when :null
      return nil
    when :boolean
      return default_value
    when :int, :long
      return Integer(default_value)
    when :float, :double
      return Float(default_value)
    when :enum, :fixed, :string, :bytes
      return default_value
    when :array
      return default_value.map { |json_val|
        read_default_value(field_schema.items, json_val)
      }
    when :map
      map = {}
      default_value.each do |key, json_val|
        map[key] = read_default_value(field_schema.values, json_val)
      end
      return map
    when :union
      # The default is interpreted against the first branch of the union.
      return read_default_value(field_schema.schemas[0], default_value)
    when :record, :error
      record = {}
      field_schema.fields.each do |field|
        json_val = default_value[field.name]
        # Only fall back when the value is absent (nil); an explicit
        # false must be kept.
        json_val = field.default if json_val.nil?
        record[field.name] = read_default_value(field.type, json_val)
      end
      return record
    else
      fail_msg = "Unknown type: #{field_schema.type}"
      raise AvroError, fail_msg
    end
  end

  # Consumes one value of writers_schema from decoder without building
  # it. Raises AvroError for an unknown schema type.
  def skip_data(writers_schema, decoder)
    case writers_schema.type_sym
    when :null
      decoder.skip_null
    when :boolean
      decoder.skip_boolean
    when :string
      decoder.skip_string
    when :int
      decoder.skip_int
    when :long
      decoder.skip_long
    when :float
      decoder.skip_float
    when :double
      decoder.skip_double
    when :bytes
      decoder.skip_bytes
    when :fixed
      skip_fixed(writers_schema, decoder)
    when :enum
      skip_enum(writers_schema, decoder)
    when :array
      skip_array(writers_schema, decoder)
    when :map
      skip_map(writers_schema, decoder)
    when :union
      skip_union(writers_schema, decoder)
    when :record, :error, :request
      skip_record(writers_schema, decoder)
    else
      raise AvroError, "Unknown schema type: #{writers_schema.type}"
    end
  end

  def skip_fixed(writers_schema, decoder)
    decoder.skip(writers_schema.size)
  end

  def skip_enum(writers_schema, decoder)
    decoder.skip_int
  end

  # Reads the branch index, then skips a value of that branch.
  def skip_union(writers_schema, decoder)
    index = decoder.read_long
    skip_data(writers_schema.schemas[index], decoder)
  end

  def skip_array(writers_schema, decoder)
    skip_blocks(decoder) { skip_data(writers_schema.items, decoder) }
  end

  def skip_map(writers_schema, decoder)
    skip_blocks(decoder) {
      decoder.skip_string
      skip_data(writers_schema.values, decoder)
    }
  end

  def skip_record(writers_schema, decoder)
    writers_schema.fields.each { |f| skip_data(f.type, decoder) }
  end

  private

  # Walks the block framing shared by arrays and maps. A negative count
  # is followed by the block's byte size, so the whole block is skipped
  # in one seek; otherwise blk is called once per item.
  def skip_blocks(decoder, &blk)
    block_count = decoder.read_long
    while block_count != 0
      if block_count < 0
        decoder.skip(decoder.read_long)
      else
        block_count.times(&blk)
      end
      block_count = decoder.read_long
    end
  end
end # DatumReader
529
+
530
+ # DatumWriter for generic ruby objects
531
class DatumWriter
  attr_accessor :writers_schema

  def initialize(writers_schema=nil)
    @writers_schema = writers_schema
  end

  # Encodes datum with the configured writer's schema.
  def write(datum, encoder)
    write_data(writers_schema, datum, encoder)
  end

  # Validates datum against writers_schema, then dispatches on the
  # schema type.
  #
  # Raises AvroTypeError when Schema.validate rejects the datum and
  # AvroError when the schema type is unknown.
  def write_data(writers_schema, datum, encoder)
    unless Schema.validate(writers_schema, datum)
      raise AvroTypeError.new(writers_schema, datum)
    end

    # function dispatch to write datum
    case writers_schema.type_sym
    when :null;    encoder.write_null(datum)
    when :boolean; encoder.write_boolean(datum)
    when :string;  encoder.write_string(datum)
    when :int;     encoder.write_int(datum)
    when :long;    encoder.write_long(datum)
    when :float;   encoder.write_float(datum)
    when :double;  encoder.write_double(datum)
    when :bytes;   encoder.write_bytes(datum)
    when :fixed;   write_fixed(writers_schema, datum, encoder)
    when :enum;    write_enum(writers_schema, datum, encoder)
    when :array;   write_array(writers_schema, datum, encoder)
    when :map;     write_map(writers_schema, datum, encoder)
    when :union;   write_union(writers_schema, datum, encoder)
    when :record, :error, :request; write_record(writers_schema, datum, encoder)
    else
      raise AvroError.new("Unknown type: #{writers_schema.type}")
    end
  end

  # A fixed is written as its raw bytes, with no length prefix.
  def write_fixed(writers_schema, datum, encoder)
    encoder.write(datum)
  end

  # An enum is written as the int index of the symbol in the schema.
  def write_enum(writers_schema, datum, encoder)
    index_of_datum = writers_schema.symbols.index(datum)
    encoder.write_int(index_of_datum)
  end

  # A non-empty array is written as one block (item count, then the
  # items); every array ends with a zero count.
  def write_array(writers_schema, datum, encoder)
    if datum.size > 0
      encoder.write_long(datum.size)
      datum.each do |item|
        write_data(writers_schema.items, item, encoder)
      end
    end
    encoder.write_long(0)
  end

  # Same framing as write_array; each entry is a string key followed by
  # its value.
  def write_map(writers_schema, datum, encoder)
    if datum.size > 0
      encoder.write_long(datum.size)
      datum.each do |k, v|
        encoder.write_string(k)
        write_data(writers_schema.values, v, encoder)
      end
    end
    encoder.write_long(0)
  end

  # Writes the index of the first union branch that validates datum,
  # then the datum encoded with that branch.
  #
  # Raises AvroTypeError when no branch accepts the datum.
  def write_union(writers_schema, datum, encoder)
    index_of_schema = writers_schema.schemas.find_index { |branch|
      Schema.validate(branch, datum)
    }
    raise AvroTypeError.new(writers_schema, datum) unless index_of_schema

    encoder.write_long(index_of_schema)
    write_data(writers_schema.schemas[index_of_schema], datum, encoder)
  end

  # Fields are written in schema order, looked up in datum by field name.
  def write_record(writers_schema, datum, encoder)
    writers_schema.fields.each do |field|
      write_data(field.type, datum[field.name], encoder)
    end
  end
end # DatumWriter
614
+ end
615
+ end