avro 1.7.4 → 1.7.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile CHANGED
@@ -45,6 +45,10 @@ task :generate_interop do
45
45
  ensure
46
46
  writer.close
47
47
  end
48
+
49
+ Avro::DataFile.open(BUILD + '/interop/data/ruby_deflate.avro', 'w', schema.to_s, :deflate) do |writer|
50
+ 20.times { writer << r.next }
51
+ end
48
52
  end
49
53
 
50
54
 
@@ -2,22 +2,22 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = "avro"
5
- s.version = "1.7.4"
5
+ s.version = "1.7.5"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Apache Software Foundation"]
9
- s.date = "2013-02-21"
9
+ s.date = "2013-08-19"
10
10
  s.description = "Avro is a data serialization and RPC format"
11
11
  s.email = "avro-dev@hadoop.apache.org"
12
12
  s.extra_rdoc_files = ["CHANGELOG", "lib/avro.rb", "lib/avro/collect_hash.rb", "lib/avro/data_file.rb", "lib/avro/io.rb", "lib/avro/ipc.rb", "lib/avro/protocol.rb", "lib/avro/schema.rb"]
13
- s.files = ["CHANGELOG", "Manifest", "Rakefile", "avro.gemspec", "interop/test_interop.rb", "lib/avro.rb", "lib/avro/collect_hash.rb", "lib/avro/data_file.rb", "lib/avro/io.rb", "lib/avro/ipc.rb", "lib/avro/protocol.rb", "lib/avro/schema.rb", "test/random_data.rb", "test/sample_ipc_client.rb", "test/sample_ipc_http_client.rb", "test/sample_ipc_http_server.rb", "test/sample_ipc_server.rb", "test/test_datafile.rb", "test/test_help.rb", "test/test_io.rb", "test/test_protocol.rb", "test/test_socket_transport.rb", "test/tool.rb"]
13
+ s.files = ["CHANGELOG", "Manifest", "Rakefile", "avro.gemspec", "interop/test_interop.rb", "lib/avro.rb", "lib/avro/collect_hash.rb", "lib/avro/data_file.rb", "lib/avro/io.rb", "lib/avro/ipc.rb", "lib/avro/protocol.rb", "lib/avro/schema.rb", "test/random_data.rb", "test/sample_ipc_client.rb", "test/sample_ipc_http_client.rb", "test/sample_ipc_http_server.rb", "test/sample_ipc_server.rb", "test/test_datafile.rb", "test/test_help.rb", "test/test_io.rb", "test/test_protocol.rb", "test/test_socket_transport.rb", "test/tool.rb", "test/test_schema.rb"]
14
14
  s.homepage = "http://hadoop.apache.org/avro/"
15
15
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Avro"]
16
16
  s.require_paths = ["lib"]
17
17
  s.rubyforge_project = "avro"
18
18
  s.rubygems_version = "1.8.15"
19
19
  s.summary = "Apache Avro for Ruby"
20
- s.test_files = ["test/test_datafile.rb", "test/test_help.rb", "test/test_protocol.rb", "test/test_socket_transport.rb", "test/test_io.rb"]
20
+ s.test_files = ["test/test_datafile.rb", "test/test_help.rb", "test/test_protocol.rb", "test/test_socket_transport.rb", "test/test_schema.rb", "test/test_io.rb"]
21
21
 
22
22
  if s.respond_to? :specification_version then
23
23
  s.specification_version = 3
@@ -23,7 +23,7 @@ class TestInterop < Test::Unit::TestCase
23
23
  HERE = File.expand_path(File.dirname(__FILE__))
24
24
  SHARE = HERE + '/../../../share'
25
25
  SCHEMAS = SHARE + '/test/schemas'
26
- Dir[HERE + '/../../../build/interop/data/*'].each do |fn|
26
+ Dir[HERE + '/../../../build/interop/data/*'].each do |fn|
27
27
  define_method("test_read_#{File.basename(fn, 'avro')}") do
28
28
  projection = Avro::Schema.parse(File.read(SCHEMAS+'/interop.avsc'))
29
29
 
@@ -19,6 +19,7 @@ require 'set'
19
19
  require 'digest/md5'
20
20
  require 'net/http'
21
21
  require 'stringio'
22
+ require 'zlib'
22
23
 
23
24
  module Avro
24
25
  VERSION = "FIXME"
@@ -24,19 +24,18 @@ module Avro
24
24
  SYNC_SIZE = 16
25
25
  SYNC_INTERVAL = 1000 * SYNC_SIZE
26
26
  META_SCHEMA = Schema.parse('{"type": "map", "values": "bytes"}')
27
- VALID_CODECS = ['null']
28
27
  VALID_ENCODINGS = ['binary'] # not used yet
29
28
 
30
29
  class DataFileError < AvroError; end
31
30
 
32
- def self.open(file_path, mode='r', schema=nil)
31
+ def self.open(file_path, mode='r', schema=nil, codec=nil)
33
32
  schema = Avro::Schema.parse(schema) if schema
34
33
  case mode
35
34
  when 'w'
36
35
  unless schema
37
36
  raise DataFileError, "Writing an Avro file requires a schema."
38
37
  end
39
- io = open_writer(File.open(file_path, 'wb'), schema)
38
+ io = open_writer(File.open(file_path, 'wb'), schema, codec)
40
39
  when 'r'
41
40
  io = open_reader(File.open(file_path, 'rb'), schema)
42
41
  else
@@ -49,11 +48,34 @@ module Avro
49
48
  io.close if block_given? && io
50
49
  end
51
50
 
51
# Returns the codec registry: a Hash mapping codec name (String) to the
# registered codec object. The registry is created on first use, so this
# never returns nil, even before any codec has been registered.
def self.codecs
  @codecs ||= {}
end

# Registers +codec+ under the string form of its +codec_name+.
#
# +codec+ may be a codec object, or a Class; a Class that does not itself
# respond to +codec_name+ is instantiated with no arguments first.
# Registering a second codec with the same name replaces the first.
# Returns the object that was stored in the registry.
def self.register_codec(codec)
  codec = codec.new if !codec.respond_to?(:codec_name) && codec.is_a?(Class)
  codecs[codec.codec_name.to_s] = codec
end

# Resolves +codec+ to a codec object.
#
# Accepted forms:
# * nil                 - treated as the name 'null'
# * a codec instance    - anything responding to both +compress+ and
#                         +decompress+ is returned unchanged
# * a Class             - instantiated with no arguments
# * a String or Symbol  - looked up by name in the registry
#
# Raises DataFileError when the value matches none of the above.
def self.get_codec(codec)
  codec ||= 'null'
  if codec.respond_to?(:compress) && codec.respond_to?(:decompress)
    codec # it's a codec instance
  elsif codec.is_a?(Class)
    codec.new # it's a codec class
  elsif codecs.key?(codec.to_s)
    # Go through the accessor so a lookup made before any registration
    # reports an unknown codec instead of failing on a nil registry.
    codecs[codec.to_s] # it's a string or symbol (codec name)
  else
    raise DataFileError, "Unknown codec: #{codec.inspect}"
  end
end
73
+
52
74
  class << self
53
75
  private
54
- def open_writer(file, schema)
76
+ def open_writer(file, schema, codec=nil)
55
77
  writer = Avro::IO::DatumWriter.new(schema)
56
- Avro::DataFile::Writer.new(file, writer, schema)
78
+ Avro::DataFile::Writer.new(file, writer, schema, codec)
57
79
  end
58
80
 
59
81
  def open_reader(file, schema)
@@ -67,10 +89,10 @@ module Avro
67
89
  OpenSSL::Random.random_bytes(16)
68
90
  end
69
91
 
70
- attr_reader :writer, :encoder, :datum_writer, :buffer_writer, :buffer_encoder, :sync_marker, :meta
92
+ attr_reader :writer, :encoder, :datum_writer, :buffer_writer, :buffer_encoder, :sync_marker, :meta, :codec
71
93
  attr_accessor :block_count
72
94
 
73
- def initialize(writer, datum_writer, writers_schema=nil)
95
+ def initialize(writer, datum_writer, writers_schema=nil, codec=nil)
74
96
  # If writers_schema is not present, presume we're appending
75
97
  @writer = writer
76
98
  @encoder = IO::BinaryEncoder.new(@writer)
@@ -83,7 +105,8 @@ module Avro
83
105
 
84
106
  if writers_schema
85
107
  @sync_marker = Writer.generate_sync_marker
86
- meta['avro.codec'] = 'null'
108
+ @codec = DataFile.get_codec(codec)
109
+ meta['avro.codec'] = @codec.codec_name.to_s
87
110
  meta['avro.schema'] = writers_schema.to_s
88
111
  datum_writer.writers_schema = writers_schema
89
112
  write_header
@@ -95,6 +118,7 @@ module Avro
95
118
  # collect metadata
96
119
  @sync_marker = dfr.sync_marker
97
120
  meta['avro.codec'] = dfr.meta['avro.codec']
121
+ @codec = DataFile.get_codec(meta['avro.codec'])
98
122
 
99
123
  # get schema used to write existing file
100
124
  schema_from_file = dfr.meta['avro.schema']
@@ -152,21 +176,15 @@ module Avro
152
176
 
153
177
  # TODO(jmhodges): make a schema for blocks and use datum_writer
154
178
  # TODO(jmhodges): do we really need the number of items in the block?
155
- # TODO(jmhodges): use codec when writing the block contents
156
179
  def write_block
157
180
  if block_count > 0
158
181
  # write number of items in block and block size in bytes
159
182
  encoder.write_long(block_count)
160
- to_write = buffer_writer.string
183
+ to_write = codec.compress(buffer_writer.string)
161
184
  encoder.write_long(to_write.size)
162
185
 
163
186
  # write block contents
164
- if meta['avro.codec'] == 'null'
165
- writer.write(to_write)
166
- else
167
- msg = "#{meta['avro.codec'].inspect} coded is not supported"
168
- raise DataFileError, msg
169
- end
187
+ writer.write(to_write)
170
188
 
171
189
  # write sync marker
172
190
  writer.write(sync_marker)
@@ -183,8 +201,14 @@ module Avro
183
201
  class Reader
184
202
  include ::Enumerable
185
203
 
186
- attr_reader :reader, :decoder, :datum_reader, :sync_marker, :meta, :file_length
187
- attr_accessor :block_count
204
+ # The reader and binary decoder for the raw file stream
205
+ attr_reader :reader, :decoder
206
+
207
+ # The binary decoder for the contents of a block (after codec decompression)
208
+ attr_reader :block_decoder
209
+
210
+ attr_reader :datum_reader, :sync_marker, :meta, :file_length, :codec
211
+ attr_accessor :block_count # records remaining in current block
188
212
 
189
213
  def initialize(reader, datum_reader)
190
214
  @reader = reader
@@ -194,11 +218,7 @@ module Avro
194
218
  # read the header: magic, meta, sync
195
219
  read_header
196
220
 
197
- # ensure the codec is valid
198
- codec_from_file = meta['avro.codec']
199
- if codec_from_file && ! VALID_CODECS.include?(codec_from_file)
200
- raise DataFileError, "Unknown codec: #{codec_from_file}"
201
- end
221
+ @codec = DataFile.get_codec(meta['avro.codec'])
202
222
 
203
223
  # get ready to read
204
224
  @block_count = 0
@@ -220,7 +240,7 @@ module Avro
220
240
  end
221
241
  end
222
242
 
223
- datum = datum_reader.read(decoder)
243
+ datum = datum_reader.read(block_decoder)
224
244
  self.block_count -= 1
225
245
  yield(datum)
226
246
  end
@@ -257,7 +277,9 @@ module Avro
257
277
 
258
278
  def read_block_header
259
279
  self.block_count = decoder.read_long
260
- decoder.read_long # not doing anything with length in bytes
280
+ block_bytes = decoder.read_long
281
+ data = codec.decompress(reader.read(block_bytes))
282
+ @block_decoder = IO::BinaryDecoder.new(StringIO.new(data))
261
283
  end
262
284
 
263
285
  # read the length of the sync marker; if it matches the sync
@@ -273,5 +295,48 @@ module Avro
273
295
  end
274
296
  end
275
297
  end
298
+
299
+
300
# Pass-through codec: block data is returned unchanged in both directions.
class NullCodec
  # Name under which this codec is identified ('null').
  def codec_name
    'null'
  end

  # Returns +data+ as-is (no decompression is performed).
  def decompress(data)
    data
  end

  # Returns +data+ as-is (no compression is performed).
  def compress(data)
    data
  end
end
305
+
306
# Codec that compresses block data with raw DEFLATE (RFC 1951), i.e.
# without the zlib (RFC 1950) header and checksum.
class DeflateCodec
  # Compression level handed to Zlib::Deflate.
  attr_reader :level

  # level - zlib compression level; defaults to Zlib::DEFAULT_COMPRESSION.
  def initialize(level=Zlib::DEFAULT_COMPRESSION)
    @level = level
  end

  # Name under which this codec is identified ('deflate').
  def codec_name; 'deflate'; end

  # Inflates a raw DEFLATE stream and returns the decompressed bytes.
  # Zlib errors raised while inflating propagate to the caller.
  def decompress(compressed)
    # Passing a negative number to Inflate puts it into "raw" RFC1951 mode
    # (without the RFC1950 header & checksum). See the docs for
    # inflateInit2 in http://www.zlib.net/manual.html
    #
    # The stream is created outside the begin/ensure so that a failure in
    # the constructor is not masked by calling #close on nil.
    zstream = Zlib::Inflate.new(-Zlib::MAX_WBITS)
    begin
      data = zstream.inflate(compressed)
      data << zstream.finish
    ensure
      zstream.close
    end
  end

  # Deflates +data+ at the configured level and returns the raw DEFLATE
  # bytes. Zlib errors (for example an invalid level) propagate to the
  # caller.
  def compress(data)
    # Negative window bits select raw DEFLATE output, mirroring #decompress.
    # Created outside the begin/ensure for the same reason as in #decompress.
    zstream = Zlib::Deflate.new(level, -Zlib::MAX_WBITS)
    begin
      compressed = zstream.deflate(data)
      compressed << zstream.finish
    ensure
      zstream.close
    end
  end
end
334
+
335
+ DataFile.register_codec NullCodec
336
+ DataFile.register_codec DeflateCodec
337
+
338
+ # TODO this constant won't be updated if you register another codec.
339
+ # Deprecated in favor of Avro::DataFile::codecs
340
+ VALID_CODECS = DataFile.codecs.keys
276
341
  end
277
342
  end
@@ -220,51 +220,43 @@ module Avro
220
220
  end
221
221
 
222
222
  class DatumReader
223
- def self.check_props(schema_one, schema_two, prop_list)
224
- prop_list.all? do |prop|
225
- schema_one.send(prop) == schema_two.send(prop)
226
- end
227
- end
228
-
229
223
  def self.match_schemas(writers_schema, readers_schema)
230
- w_type = writers_schema.type
231
- r_type = readers_schema.type
224
+ w_type = writers_schema.type_sym
225
+ r_type = readers_schema.type_sym
232
226
 
233
227
  # This conditional is begging for some OO love.
234
- if w_type == 'union' || r_type == 'union'
228
+ if w_type == :union || r_type == :union
235
229
  return true
236
230
  end
237
231
 
238
232
  if w_type == r_type
239
- if Schema::PRIMITIVE_TYPES.include?(w_type) &&
240
- Schema::PRIMITIVE_TYPES.include?(r_type)
241
- return true
242
- end
233
+ return true if Schema::PRIMITIVE_TYPES_SYM.include?(r_type)
243
234
 
244
235
  case r_type
245
- when 'record'
246
- return check_props(writers_schema, readers_schema, [:fullname])
247
- when 'error'
248
- return check_props(writers_schema, readers_schema, [:fullname])
249
- when 'request'
236
+ when :record
237
+ return writers_schema.fullname == readers_schema.fullname
238
+ when :error
239
+ return writers_schema.fullname == readers_schema.fullname
240
+ when :request
250
241
  return true
251
- when 'fixed'
252
- return check_props(writers_schema, readers_schema, [:fullname, :size])
253
- when 'enum'
254
- return check_props(writers_schema, readers_schema, [:fullname])
255
- when 'map'
256
- return check_props(writers_schema.values, readers_schema.values, [:type])
257
- when 'array'
258
- return check_props(writers_schema.items, readers_schema.items, [:type])
242
+ when :fixed
243
+ return writers_schema.fullname == readers_schema.fullname &&
244
+ writers_schema.size == readers_schema.size
245
+ when :enum
246
+ return writers_schema.fullname == readers_schema.fullname
247
+ when :map
248
+ return writers_schema.values.type == readers_schema.values.type
249
+ when :array
250
+ return writers_schema.items.type == readers_schema.items.type
259
251
  end
260
252
  end
261
253
 
262
254
  # Handle schema promotion
263
- if w_type == 'int' && ['long', 'float', 'double'].include?(r_type)
255
+ if w_type == :int && [:long, :float, :double].include?(r_type)
264
256
  return true
265
- elsif w_type == 'long' && ['float', 'double'].include?(r_type)
257
+ elsif w_type == :long && [:float, :double].include?(r_type)
266
258
  return true
267
- elsif w_type == 'float' && r_type == 'double'
259
+ elsif w_type == :float && r_type == :double
268
260
  return true
269
261
  end
270
262
 
@@ -291,7 +283,7 @@ module Avro
291
283
 
292
284
  # schema resolution: reader's schema is a union, writer's
293
285
  # schema is not
294
- if writers_schema.type != 'union' && readers_schema.type == 'union'
286
+ if writers_schema.type_sym != :union && readers_schema.type_sym == :union
295
287
  rs = readers_schema.schemas.find{|s|
296
288
  self.class.match_schemas(writers_schema, s)
297
289
  }
@@ -301,21 +293,21 @@ module Avro
301
293
 
302
294
  # function dispatch for reading data based on type of writer's
303
295
  # schema
304
- case writers_schema.type
305
- when 'null'; decoder.read_null
306
- when 'boolean'; decoder.read_boolean
307
- when 'string'; decoder.read_string
308
- when 'int'; decoder.read_int
309
- when 'long'; decoder.read_long
310
- when 'float'; decoder.read_float
311
- when 'double'; decoder.read_double
312
- when 'bytes'; decoder.read_bytes
313
- when 'fixed'; read_fixed(writers_schema, readers_schema, decoder)
314
- when 'enum'; read_enum(writers_schema, readers_schema, decoder)
315
- when 'array'; read_array(writers_schema, readers_schema, decoder)
316
- when 'map'; read_map(writers_schema, readers_schema, decoder)
317
- when 'union'; read_union(writers_schema, readers_schema, decoder)
318
- when 'record', 'error', 'request'; read_record(writers_schema, readers_schema, decoder)
296
+ case writers_schema.type_sym
297
+ when :null; decoder.read_null
298
+ when :boolean; decoder.read_boolean
299
+ when :string; decoder.read_string
300
+ when :int; decoder.read_int
301
+ when :long; decoder.read_long
302
+ when :float; decoder.read_float
303
+ when :double; decoder.read_double
304
+ when :bytes; decoder.read_bytes
305
+ when :fixed; read_fixed(writers_schema, readers_schema, decoder)
306
+ when :enum; read_enum(writers_schema, readers_schema, decoder)
307
+ when :array; read_array(writers_schema, readers_schema, decoder)
308
+ when :map; read_map(writers_schema, readers_schema, decoder)
309
+ when :union; read_union(writers_schema, readers_schema, decoder)
310
+ when :record, :error, :request; read_record(writers_schema, readers_schema, decoder)
319
311
  else
320
312
  raise AvroError, "Cannot read unknown schema type: #{writers_schema.type}"
321
313
  end
@@ -416,34 +408,34 @@ module Avro
416
408
 
417
409
  def read_default_value(field_schema, default_value)
418
410
  # Basically a JSON Decoder?
419
- case field_schema.type
420
- when 'null'
411
+ case field_schema.type_sym
412
+ when :null
421
413
  return nil
422
- when 'boolean'
414
+ when :boolean
423
415
  return default_value
424
- when 'int', 'long'
416
+ when :int, :long
425
417
  return Integer(default_value)
426
- when 'float', 'double'
418
+ when :float, :double
427
419
  return Float(default_value)
428
- when 'enum', 'fixed', 'string', 'bytes'
420
+ when :enum, :fixed, :string, :bytes
429
421
  return default_value
430
- when 'array'
422
+ when :array
431
423
  read_array = []
432
424
  default_value.each do |json_val|
433
425
  item_val = read_default_value(field_schema.items, json_val)
434
426
  read_array << item_val
435
427
  end
436
428
  return read_array
437
- when 'map'
429
+ when :map
438
430
  read_map = {}
439
431
  default_value.each do |key, json_val|
440
432
  map_val = read_default_value(field_schema.values, json_val)
441
433
  read_map[key] = map_val
442
434
  end
443
435
  return read_map
444
- when 'union'
436
+ when :union
445
437
  return read_default_value(field_schema.schemas[0], default_value)
446
- when 'record', 'error'
438
+ when :record, :error
447
439
  read_record = {}
448
440
  field_schema.fields.each do |field|
449
441
  json_val = default_value[field.name]
@@ -459,37 +451,37 @@ module Avro
459
451
  end
460
452
 
461
453
  def skip_data(writers_schema, decoder)
462
- case writers_schema.type
463
- when 'null'
454
+ case writers_schema.type_sym
455
+ when :null
464
456
  decoder.skip_null
465
- when 'boolean'
457
+ when :boolean
466
458
  decoder.skip_boolean
467
- when 'string'
459
+ when :string
468
460
  decoder.skip_string
469
- when 'int'
461
+ when :int
470
462
  decoder.skip_int
471
- when 'long'
463
+ when :long
472
464
  decoder.skip_long
473
- when 'float'
465
+ when :float
474
466
  decoder.skip_float
475
- when 'double'
467
+ when :double
476
468
  decoder.skip_double
477
- when 'bytes'
469
+ when :bytes
478
470
  decoder.skip_bytes
479
- when 'fixed'
471
+ when :fixed
480
472
  skip_fixed(writers_schema, decoder)
481
- when 'enum'
473
+ when :enum
482
474
  skip_enum(writers_schema, decoder)
483
- when 'array'
475
+ when :array
484
476
  skip_array(writers_schema, decoder)
485
- when 'map'
477
+ when :map
486
478
  skip_map(writers_schema, decoder)
487
- when 'union'
479
+ when :union
488
480
  skip_union(writers_schema, decoder)
489
- when 'record', 'error', 'request'
481
+ when :record, :error, :request
490
482
  skip_record(writers_schema, decoder)
491
483
  else
492
- raise AvroError, "Unknown schema type: #{schm.type}"
484
+ raise AvroError, "Unknown schema type: #{writers_schema.type}"
493
485
  end
494
486
  end
495
487
 
@@ -552,21 +544,21 @@ module Avro
552
544
  end
553
545
 
554
546
  # function dispatch to write datum
555
- case writers_schema.type
556
- when 'null'; encoder.write_null(datum)
557
- when 'boolean'; encoder.write_boolean(datum)
558
- when 'string'; encoder.write_string(datum)
559
- when 'int'; encoder.write_int(datum)
560
- when 'long'; encoder.write_long(datum)
561
- when 'float'; encoder.write_float(datum)
562
- when 'double'; encoder.write_double(datum)
563
- when 'bytes'; encoder.write_bytes(datum)
564
- when 'fixed'; write_fixed(writers_schema, datum, encoder)
565
- when 'enum'; write_enum(writers_schema, datum, encoder)
566
- when 'array'; write_array(writers_schema, datum, encoder)
567
- when 'map'; write_map(writers_schema, datum, encoder)
568
- when 'union'; write_union(writers_schema, datum, encoder)
569
- when 'record', 'error', 'request'; write_record(writers_schema, datum, encoder)
547
+ case writers_schema.type_sym
548
+ when :null; encoder.write_null(datum)
549
+ when :boolean; encoder.write_boolean(datum)
550
+ when :string; encoder.write_string(datum)
551
+ when :int; encoder.write_int(datum)
552
+ when :long; encoder.write_long(datum)
553
+ when :float; encoder.write_float(datum)
554
+ when :double; encoder.write_double(datum)
555
+ when :bytes; encoder.write_bytes(datum)
556
+ when :fixed; write_fixed(writers_schema, datum, encoder)
557
+ when :enum; write_enum(writers_schema, datum, encoder)
558
+ when :array; write_array(writers_schema, datum, encoder)
559
+ when :map; write_map(writers_schema, datum, encoder)
560
+ when :union; write_union(writers_schema, datum, encoder)
561
+ when :record, :error, :request; write_record(writers_schema, datum, encoder)
570
562
  else
571
563
  raise AvroError.new("Unknown type: #{writers_schema.type}")
572
564
  end