avro-jruby 1.7.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,615 @@
1
+ # Licensed to the Apache Software Foundation (ASF) under one
2
+ # or more contributor license agreements. See the NOTICE file
3
+ # distributed with this work for additional information
4
+ # regarding copyright ownership. The ASF licenses this file
5
+ # to you under the Apache License, Version 2.0 (the
6
+ # "License"); you may not use this file except in compliance
7
+ # with the License. You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ module Avro
18
+ module IO
19
# Raised when a datum does not conform to the schema it is checked against.
class AvroTypeError < AvroError
  # expected_schema:: the schema the datum was checked against
  # datum:: the offending value (shown via #inspect in the message)
  def initialize(expected_schema, datum)
    message = "The datum #{datum.inspect} is not an example of schema #{expected_schema}"
    super(message)
  end
end
25
+
26
+ # Raised when writer's and reader's schema do not match
27
class SchemaMatchException < AvroError
  # writers_schema:: schema the data was written with
  # readers_schema:: schema the caller asked to read the data as
  def initialize(writers_schema, readers_schema)
    message = "Writer's schema #{writers_schema} and Reader's schema #{readers_schema} do not match."
    super(message)
  end
end
33
+
34
+ # FIXME(jmhodges) move validate to this module?
35
+
36
# Reads Avro leaf (primitive) values in binary encoding from an
# underlying reader.
class BinaryDecoder
  # reader is an object on which we can call read, seek and tell.
  attr_reader :reader

  def initialize(reader)
    @reader = reader
  end

  # Reads a single byte and returns it as an Integer in 0..255.
  #
  # Raises EOFError when the reader has no more data, so that truncated
  # input is reported as such instead of failing with a NoMethodError
  # on nil further down.
  def byte!
    chunk = @reader.read(1)
    if chunk.nil? || chunk.empty?
      raise EOFError, 'unexpected end of input while reading a byte'
    end
    chunk.unpack('C').first
  end

  # null is written as zero bytes, so nothing is consumed.
  def read_null
    nil
  end

  # A boolean is a single byte; only the value 1 is treated as true.
  def read_boolean
    byte! == 1
  end

  # ints share the variable-length zig-zag encoding used for longs.
  def read_int
    read_long
  end

  # int and long values are written using variable-length,
  # zig-zag coding.
  def read_long
    b = byte!
    n = b & 0x7F
    shift = 7
    # The high bit of every byte signals that another byte follows.
    while (b & 0x80) != 0
      b = byte!
      n |= (b & 0x7F) << shift
      shift += 7
    end
    # Undo the zig-zag mapping (0, -1, 1, -2, ... <- 0, 1, 2, 3, ...).
    (n >> 1) ^ -(n & 1)
  end

  # A float is written as 4 bytes.
  # The float is converted into a 32-bit integer using a method
  # equivalent to Java's floatToIntBits and then encoded in
  # little-endian format.
  def read_float
    @reader.read(4).unpack('e')[0]
  end

  # A double is written as 8 bytes.
  # The double is converted into a 64-bit integer using a method
  # equivalent to Java's doubleToLongBits and then encoded in
  # little-endian format.
  def read_double
    @reader.read(8).unpack('E')[0]
  end

  # Bytes are encoded as a long followed by that many bytes of data.
  def read_bytes
    read(read_long)
  end

  # A string is encoded as a long followed by that many bytes of
  # UTF-8 encoded character data.
  def read_string
    read_bytes.tap do |string|
      # Only tag the encoding where the runtime supports it.
      string.force_encoding("UTF-8") if string.respond_to? :force_encoding
    end
  end

  # Reads len bytes straight from the underlying reader.
  def read(len)
    @reader.read(len)
  end

  # null occupies no bytes, so there is nothing to skip.
  def skip_null
    nil
  end

  def skip_boolean
    skip(1)
  end

  def skip_int
    skip_long
  end

  # Consumes one variable-length integer without decoding its value.
  def skip_long
    b = byte!
    while (b & 0x80) != 0
      b = byte!
    end
  end

  def skip_float
    skip(4)
  end

  def skip_double
    skip(8)
  end

  # Reads the length prefix, then jumps over that many bytes.
  def skip_bytes
    skip(read_long)
  end

  def skip_string
    skip_bytes
  end

  # Advances the reader's position by n bytes using seek/tell.
  def skip(n)
    reader.seek(reader.tell() + n)
  end
end
148
+
149
# Writes Avro leaf (primitive) values in binary encoding to an
# underlying writer (any object responding to write).
class BinaryEncoder
  attr_reader :writer

  def initialize(writer)
    @writer = writer
  end

  # null is written as zero bytes
  def write_null(datum)
    nil
  end

  # a boolean is written as a single byte
  # whose value is either 0 (false) or 1 (true).
  def write_boolean(datum)
    writer.write(datum ? 1.chr : 0.chr)
  end

  # int and long values are written using variable-length,
  # zig-zag coding.
  def write_int(n)
    write_long(n)
  end

  # int and long values are written using variable-length,
  # zig-zag coding.
  def write_long(n)
    # Zig-zag: map signed values onto unsigned ones so that small
    # magnitudes (positive or negative) need few bytes.
    n = (n << 1) ^ (n >> 63)
    # Emit 7 bits at a time, low bits first; the high bit marks
    # "more bytes follow".
    while (n & ~0x7F) != 0
      @writer.write(((n & 0x7f) | 0x80).chr)
      n >>= 7
    end
    @writer.write(n.chr)
  end

  # A float is written as 4 bytes.
  # The float is converted into a 32-bit integer using a method
  # equivalent to Java's floatToIntBits and then encoded in
  # little-endian format.
  def write_float(datum)
    @writer.write([datum].pack('e'))
  end

  # A double is written as 8 bytes.
  # The double is converted into a 64-bit integer using a method
  # equivalent to Java's doubleToLongBits and then encoded in
  # little-endian format.
  def write_double(datum)
    @writer.write([datum].pack('E'))
  end

  # Bytes are encoded as a long followed by that many bytes of data.
  def write_bytes(datum)
    write_long(datum.bytesize)
    @writer.write(datum)
  end

  # A string is encoded as a long followed by that many bytes of
  # UTF-8 encoded character data.
  #
  # Text in another character encoding is transcoded to UTF-8 first, so
  # that both the length prefix and the payload describe UTF-8 bytes.
  # String#encode may raise an Encoding error for text that cannot be
  # converted.
  def write_string(datum)
    write_bytes(utf8_payload(datum))
  end

  # Write an arbitrary datum.
  def write(datum)
    writer.write(datum)
  end

  private

  # Returns text as UTF-8. Strings already tagged UTF-8 and binary
  # (ASCII-8BIT) strings are passed through untouched: the latter carry
  # no character set to convert from and are written byte-for-byte.
  # Objects without #encode (no encoding support) are returned as-is.
  def utf8_payload(text)
    return text unless text.respond_to?(:encode)

    source = text.encoding
    return text if source == Encoding::UTF_8 || source == Encoding::BINARY

    text.encode(Encoding::UTF_8)
  end
end
221
+
222
# Reads data written with a writer's schema and presents it according to
# a reader's schema, producing generic ruby objects (hashes, arrays,
# strings, numbers, booleans and nil).
class DatumReader
  # Returns true when data written with writers_schema may be read as
  # readers_schema.
  #
  # Unions always match here; the individual branches are resolved
  # while the data is being read. Array item schemas and map value
  # schemas are matched recursively, so element-level promotions
  # (e.g. array of int read as array of long) are accepted.
  def self.match_schemas(writers_schema, readers_schema)
    w_type = writers_schema.type_sym
    r_type = readers_schema.type_sym

    return true if w_type == :union || r_type == :union

    if w_type == r_type
      return true if Schema::PRIMITIVE_TYPES_SYM.include?(r_type)

      case r_type
      when :record, :error, :enum
        return writers_schema.fullname == readers_schema.fullname
      when :request
        return true
      when :fixed
        return writers_schema.fullname == readers_schema.fullname &&
               writers_schema.size == readers_schema.size
      when :map
        return match_schemas(writers_schema.values, readers_schema.values)
      when :array
        return match_schemas(writers_schema.items, readers_schema.items)
      end
    end

    # Handle schema promotion
    case w_type
    when :int   then [:long, :float, :double].include?(r_type)
    when :long  then [:float, :double].include?(r_type)
    when :float then r_type == :double
    else false
    end
  end

  attr_accessor :writers_schema, :readers_schema

  def initialize(writers_schema=nil, readers_schema=nil)
    @writers_schema = writers_schema
    @readers_schema = readers_schema
  end

  # Reads one datum from decoder. When no reader's schema has been set,
  # the writer's schema is used (and stored) as the reader's schema.
  def read(decoder)
    self.readers_schema = writers_schema unless readers_schema
    read_data(writers_schema, readers_schema, decoder)
  end

  # Reads a single value of writers_schema from decoder, resolved
  # against readers_schema.
  #
  # Raises SchemaMatchException when the two schemas do not match and
  # AvroError when the writer's schema has an unknown type.
  def read_data(writers_schema, readers_schema, decoder)
    # schema matching
    unless self.class.match_schemas(writers_schema, readers_schema)
      raise SchemaMatchException.new(writers_schema, readers_schema)
    end

    # schema resolution: reader's schema is a union, writer's
    # schema is not. Pick the first reader branch that matches.
    if writers_schema.type_sym != :union && readers_schema.type_sym == :union
      branch = readers_schema.schemas.find do |candidate|
        self.class.match_schemas(writers_schema, candidate)
      end
      raise SchemaMatchException.new(writers_schema, readers_schema) unless branch
      return read_data(writers_schema, branch, decoder)
    end

    # function dispatch for reading data based on type of writer's
    # schema
    case writers_schema.type_sym
    when :null    then decoder.read_null
    when :boolean then decoder.read_boolean
    when :string  then decoder.read_string
    when :int     then decoder.read_int
    when :long    then decoder.read_long
    when :float   then decoder.read_float
    when :double  then decoder.read_double
    when :bytes   then decoder.read_bytes
    when :fixed   then read_fixed(writers_schema, readers_schema, decoder)
    when :enum    then read_enum(writers_schema, readers_schema, decoder)
    when :array   then read_array(writers_schema, readers_schema, decoder)
    when :map     then read_map(writers_schema, readers_schema, decoder)
    when :union   then read_union(writers_schema, readers_schema, decoder)
    when :record, :error, :request
      read_record(writers_schema, readers_schema, decoder)
    else
      raise AvroError, "Cannot read unknown schema type: #{writers_schema.type}"
    end
  end

  # A fixed is exactly writers_schema.size raw bytes.
  def read_fixed(writers_schema, readers_schema, decoder)
    decoder.read(writers_schema.size)
  end

  # An enum is written as the int index of the symbol in the writer's
  # symbol list; the symbol itself is returned.
  def read_enum(writers_schema, readers_schema, decoder)
    index_of_symbol = decoder.read_int
    read_symbol = writers_schema.symbols[index_of_symbol]

    # TODO(jmhodges): figure out what unset means for resolution
    # schema resolution
    unless readers_schema.symbols.include?(read_symbol)
      # 'unset' here
    end

    read_symbol
  end

  # Reads a block-encoded array and returns it as a ruby Array.
  def read_array(writers_schema, readers_schema, decoder)
    read_items = []
    read_blocks(decoder) do
      read_items << read_data(writers_schema.items,
                              readers_schema.items,
                              decoder)
    end
    read_items
  end

  # Reads a block-encoded map and returns it as a ruby Hash with
  # string keys.
  def read_map(writers_schema, readers_schema, decoder)
    read_items = {}
    read_blocks(decoder) do
      key = decoder.read_string
      read_items[key] = read_data(writers_schema.values,
                                  readers_schema.values,
                                  decoder)
    end
    read_items
  end

  # A union is written as the long index of the branch in the writer's
  # union followed by a value of that branch.
  def read_union(writers_schema, readers_schema, decoder)
    index_of_schema = decoder.read_long
    selected_writers_schema = writers_schema.schemas[index_of_schema]

    read_data(selected_writers_schema, readers_schema, decoder)
  end

  # Reads a record as a Hash keyed by field name. Writer fields unknown
  # to the reader are skipped; reader fields absent from the writer are
  # filled from their (non-nil) defaults.
  def read_record(writers_schema, readers_schema, decoder)
    readers_fields_hash = readers_schema.fields_hash
    read_record = {}
    writers_schema.fields.each do |field|
      readers_field = readers_fields_hash[field.name]
      if readers_field
        read_record[field.name] = read_data(field.type, readers_field.type, decoder)
      else
        skip_data(field.type, decoder)
      end
    end

    # fill in the default values
    if readers_fields_hash.size > read_record.size
      writers_fields_hash = writers_schema.fields_hash
      readers_fields_hash.each do |field_name, field|
        next if writers_fields_hash.has_key?(field_name)
        # FIXME(jmhodges) another 'unset' here when there is no default
        next if field.default.nil?

        read_record[field.name] = read_default_value(field.type, field.default)
      end
    end

    read_record
  end

  # Converts a default value taken from a schema definition (already
  # parsed JSON) into the ruby value a read of field_schema would yield.
  #
  # Raises AvroError for an unknown schema type.
  def read_default_value(field_schema, default_value)
    # Basically a JSON Decoder?
    case field_schema.type_sym
    when :null
      nil
    when :boolean, :enum, :fixed, :string, :bytes
      default_value
    when :int, :long
      Integer(default_value)
    when :float, :double
      Float(default_value)
    when :array
      default_value.map do |json_val|
        read_default_value(field_schema.items, json_val)
      end
    when :map
      read_map = {}
      default_value.each do |key, json_val|
        read_map[key] = read_default_value(field_schema.values, json_val)
      end
      read_map
    when :union
      # The default of a union is interpreted with its first branch.
      read_default_value(field_schema.schemas[0], default_value)
    when :record, :error
      read_record = {}
      field_schema.fields.each do |field|
        json_val = default_value[field.name]
        # Fall back to the field's own default only when the value is
        # absent; an explicit false must be kept.
        json_val = field.default if json_val.nil?
        read_record[field.name] = read_default_value(field.type, json_val)
      end
      read_record
    else
      fail_msg = "Unknown type: #{field_schema.type}"
      raise AvroError, fail_msg
    end
  end

  # Consumes one value of writers_schema from decoder without building
  # it. Raises AvroError for an unknown schema type.
  def skip_data(writers_schema, decoder)
    case writers_schema.type_sym
    when :null    then decoder.skip_null
    when :boolean then decoder.skip_boolean
    when :string  then decoder.skip_string
    when :int     then decoder.skip_int
    when :long    then decoder.skip_long
    when :float   then decoder.skip_float
    when :double  then decoder.skip_double
    when :bytes   then decoder.skip_bytes
    when :fixed   then skip_fixed(writers_schema, decoder)
    when :enum    then skip_enum(writers_schema, decoder)
    when :array   then skip_array(writers_schema, decoder)
    when :map     then skip_map(writers_schema, decoder)
    when :union   then skip_union(writers_schema, decoder)
    when :record, :error, :request
      skip_record(writers_schema, decoder)
    else
      raise AvroError, "Unknown schema type: #{writers_schema.type}"
    end
  end

  def skip_fixed(writers_schema, decoder)
    decoder.skip(writers_schema.size)
  end

  def skip_enum(writers_schema, decoder)
    decoder.skip_int
  end

  # The branch index must be decoded to know which schema to skip.
  def skip_union(writers_schema, decoder)
    index = decoder.read_long
    skip_data(writers_schema.schemas[index], decoder)
  end

  def skip_array(writers_schema, decoder)
    skip_blocks(decoder) { skip_data(writers_schema.items, decoder) }
  end

  def skip_map(writers_schema, decoder)
    skip_blocks(decoder) do
      decoder.skip_string
      skip_data(writers_schema.values, decoder)
    end
  end

  def skip_record(writers_schema, decoder)
    writers_schema.fields.each { |f| skip_data(f.type, decoder) }
  end

  private

  # Walks the blocks of an array or map, yielding once per item. A
  # negative count is followed by a long (the block's size), which is
  # consumed and ignored because every item is read individually.
  def read_blocks(decoder)
    block_count = decoder.read_long
    while block_count != 0
      if block_count < 0
        block_count = -block_count
        decoder.read_long
      end
      block_count.times { yield }
      block_count = decoder.read_long
    end
  end

  # Walks the blocks of an array or map without reading the items. A
  # block with a negative count carries its size, so it is skipped in
  # one jump; otherwise blk is called once per item.
  def skip_blocks(decoder, &blk)
    block_count = decoder.read_long
    while block_count != 0
      if block_count < 0
        decoder.skip(decoder.read_long)
      else
        block_count.times(&blk)
      end
      block_count = decoder.read_long
    end
  end
end # DatumReader
529
+
530
# DatumWriter for generic ruby objects
class DatumWriter
  attr_accessor :writers_schema

  def initialize(writers_schema=nil)
    @writers_schema = writers_schema
  end

  # Writes datum to encoder according to the configured writer's schema.
  def write(datum, encoder)
    write_data(writers_schema, datum, encoder)
  end

  # Validates datum against writers_schema and writes it to encoder.
  #
  # Raises AvroTypeError when Schema.validate rejects the datum and
  # AvroError when the schema has an unknown type.
  def write_data(writers_schema, datum, encoder)
    unless Schema.validate(writers_schema, datum)
      raise AvroTypeError.new(writers_schema, datum)
    end

    # function dispatch to write datum
    case writers_schema.type_sym
    when :null    then encoder.write_null(datum)
    when :boolean then encoder.write_boolean(datum)
    when :string  then encoder.write_string(datum)
    when :int     then encoder.write_int(datum)
    when :long    then encoder.write_long(datum)
    when :float   then encoder.write_float(datum)
    when :double  then encoder.write_double(datum)
    when :bytes   then encoder.write_bytes(datum)
    when :fixed   then write_fixed(writers_schema, datum, encoder)
    when :enum    then write_enum(writers_schema, datum, encoder)
    when :array   then write_array(writers_schema, datum, encoder)
    when :map     then write_map(writers_schema, datum, encoder)
    when :union   then write_union(writers_schema, datum, encoder)
    when :record, :error, :request
      write_record(writers_schema, datum, encoder)
    else
      raise AvroError.new("Unknown type: #{writers_schema.type}")
    end
  end

  # A fixed is written as its raw bytes, without a length prefix.
  def write_fixed(writers_schema, datum, encoder)
    encoder.write(datum)
  end

  # An enum is written as the int index of the symbol in the schema's
  # symbol list.
  def write_enum(writers_schema, datum, encoder)
    index_of_datum = writers_schema.symbols.index(datum)
    encoder.write_int(index_of_datum)
  end

  # Writes a non-empty array as a single block (item count followed by
  # the items); the terminating zero count is always written.
  def write_array(writers_schema, datum, encoder)
    if datum.size > 0
      encoder.write_long(datum.size)
      datum.each do |item|
        write_data(writers_schema.items, item, encoder)
      end
    end
    encoder.write_long(0)
  end

  # Writes a non-empty map as a single block of key/value pairs; the
  # terminating zero count is always written.
  def write_map(writers_schema, datum, encoder)
    if datum.size > 0
      encoder.write_long(datum.size)
      datum.each do |k, v|
        encoder.write_string(k)
        write_data(writers_schema.values, v, encoder)
      end
    end
    encoder.write_long(0)
  end

  # Writes the index of the first union branch that validates datum,
  # followed by the datum encoded with that branch.
  #
  # Raises AvroTypeError when no branch accepts the datum.
  def write_union(writers_schema, datum, encoder)
    index_of_schema = writers_schema.schemas.find_index do |candidate|
      Schema.validate(candidate, datum)
    end
    raise AvroTypeError.new(writers_schema, datum) unless index_of_schema

    encoder.write_long(index_of_schema)
    write_data(writers_schema.schemas[index_of_schema], datum, encoder)
  end

  # Writes the record's fields in schema order, looking each value up
  # in datum by field name.
  def write_record(writers_schema, datum, encoder)
    writers_schema.fields.each do |field|
      write_data(field.type, datum[field.name], encoder)
    end
  end
end # DatumWriter
614
+ end
615
+ end