avro 1.7.4 → 1.7.5

data/Rakefile CHANGED
@@ -45,6 +45,10 @@ task :generate_interop do
   ensure
     writer.close
   end
+
+  Avro::DataFile.open(BUILD + '/interop/data/ruby_deflate.avro', 'w', schema.to_s, :deflate) do |writer|
+    20.times { writer << r.next }
+  end
 end


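The new Rakefile block exercises the codec parameter added to Avro::DataFile.open in this release. As a minimal round-trip sketch of that API (the schema file name and the record below are illustrative placeholders, not part of the gem):

  require 'avro'

  # Write a deflate-compressed container file; the block yields a DataFile::Writer.
  schema = File.read('user.avsc')   # hypothetical schema file
  Avro::DataFile.open('users.avro', 'w', schema, :deflate) do |writer|
    writer << { 'name' => 'alice', 'favorite_number' => 7 }   # sample record
  end

  # Reading is unchanged: the codec is resolved from the file's avro.codec metadata.
  Avro::DataFile.open('users.avro') do |reader|
    reader.each { |record| p record }
  end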
data/avro.gemspec CHANGED
@@ -2,22 +2,22 @@
 
 Gem::Specification.new do |s|
   s.name = "avro"
-  s.version = "1.7.4"
+  s.version = "1.7.5"
 
   s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
   s.authors = ["Apache Software Foundation"]
-  s.date = "2013-02-21"
+  s.date = "2013-08-19"
   s.description = "Avro is a data serialization and RPC format"
   s.email = "avro-dev@hadoop.apache.org"
   s.extra_rdoc_files = ["CHANGELOG", "lib/avro.rb", "lib/avro/collect_hash.rb", "lib/avro/data_file.rb", "lib/avro/io.rb", "lib/avro/ipc.rb", "lib/avro/protocol.rb", "lib/avro/schema.rb"]
-  s.files = ["CHANGELOG", "Manifest", "Rakefile", "avro.gemspec", "interop/test_interop.rb", "lib/avro.rb", "lib/avro/collect_hash.rb", "lib/avro/data_file.rb", "lib/avro/io.rb", "lib/avro/ipc.rb", "lib/avro/protocol.rb", "lib/avro/schema.rb", "test/random_data.rb", "test/sample_ipc_client.rb", "test/sample_ipc_http_client.rb", "test/sample_ipc_http_server.rb", "test/sample_ipc_server.rb", "test/test_datafile.rb", "test/test_help.rb", "test/test_io.rb", "test/test_protocol.rb", "test/test_socket_transport.rb", "test/tool.rb"]
+  s.files = ["CHANGELOG", "Manifest", "Rakefile", "avro.gemspec", "interop/test_interop.rb", "lib/avro.rb", "lib/avro/collect_hash.rb", "lib/avro/data_file.rb", "lib/avro/io.rb", "lib/avro/ipc.rb", "lib/avro/protocol.rb", "lib/avro/schema.rb", "test/random_data.rb", "test/sample_ipc_client.rb", "test/sample_ipc_http_client.rb", "test/sample_ipc_http_server.rb", "test/sample_ipc_server.rb", "test/test_datafile.rb", "test/test_help.rb", "test/test_io.rb", "test/test_protocol.rb", "test/test_socket_transport.rb", "test/tool.rb", "test/test_schema.rb"]
   s.homepage = "http://hadoop.apache.org/avro/"
   s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Avro"]
   s.require_paths = ["lib"]
   s.rubyforge_project = "avro"
   s.rubygems_version = "1.8.15"
   s.summary = "Apache Avro for Ruby"
-  s.test_files = ["test/test_datafile.rb", "test/test_help.rb", "test/test_protocol.rb", "test/test_socket_transport.rb", "test/test_io.rb"]
+  s.test_files = ["test/test_datafile.rb", "test/test_help.rb", "test/test_protocol.rb", "test/test_socket_transport.rb", "test/test_schema.rb", "test/test_io.rb"]
 
   if s.respond_to? :specification_version then
     s.specification_version = 3
data/interop/test_interop.rb CHANGED
@@ -23,7 +23,7 @@ class TestInterop < Test::Unit::TestCase
   HERE = File.expand_path(File.dirname(__FILE__))
   SHARE = HERE + '/../../../share'
   SCHEMAS = SHARE + '/test/schemas'
-  Dir[HERE + '/../../../build/interop/data/*'].each do |fn|
+  Dir[HERE + '/../../../build/interop/data/*'].each do |fn|
     define_method("test_read_#{File.basename(fn, 'avro')}") do
       projection = Avro::Schema.parse(File.read(SCHEMAS+'/interop.avsc'))
 
data/lib/avro.rb CHANGED
@@ -19,6 +19,7 @@ require 'set'
 require 'digest/md5'
 require 'net/http'
 require 'stringio'
+require 'zlib'
 
 module Avro
   VERSION = "FIXME"
data/lib/avro/data_file.rb CHANGED
@@ -24,19 +24,18 @@ module Avro
     SYNC_SIZE = 16
     SYNC_INTERVAL = 1000 * SYNC_SIZE
     META_SCHEMA = Schema.parse('{"type": "map", "values": "bytes"}')
-    VALID_CODECS = ['null']
     VALID_ENCODINGS = ['binary'] # not used yet
 
     class DataFileError < AvroError; end
 
-    def self.open(file_path, mode='r', schema=nil)
+    def self.open(file_path, mode='r', schema=nil, codec=nil)
       schema = Avro::Schema.parse(schema) if schema
       case mode
       when 'w'
         unless schema
           raise DataFileError, "Writing an Avro file requires a schema."
         end
-        io = open_writer(File.open(file_path, 'wb'), schema)
+        io = open_writer(File.open(file_path, 'wb'), schema, codec)
       when 'r'
         io = open_reader(File.open(file_path, 'rb'), schema)
       else
@@ -49,11 +48,34 @@ module Avro
       io.close if block_given? && io
     end
 
+    def self.codecs
+      @codecs
+    end
+
+    def self.register_codec(codec)
+      @codecs ||= {}
+      codec = codec.new if !codec.respond_to?(:codec_name) && codec.is_a?(Class)
+      @codecs[codec.codec_name.to_s] = codec
+    end
+
+    def self.get_codec(codec)
+      codec ||= 'null'
+      if codec.respond_to?(:compress) && codec.respond_to?(:decompress)
+        codec # it's a codec instance
+      elsif codec.is_a?(Class)
+        codec.new # it's a codec class
+      elsif @codecs.include?(codec.to_s)
+        @codecs[codec.to_s] # it's a string or symbol (codec name)
+      else
+        raise DataFileError, "Unknown codec: #{codec.inspect}"
+      end
+    end
+
     class << self
       private
-      def open_writer(file, schema)
+      def open_writer(file, schema, codec=nil)
         writer = Avro::IO::DatumWriter.new(schema)
-        Avro::DataFile::Writer.new(file, writer, schema)
+        Avro::DataFile::Writer.new(file, writer, schema, codec)
       end
 
       def open_reader(file, schema)
@@ -67,10 +89,10 @@ module Avro
         OpenSSL::Random.random_bytes(16)
       end
 
-      attr_reader :writer, :encoder, :datum_writer, :buffer_writer, :buffer_encoder, :sync_marker, :meta
+      attr_reader :writer, :encoder, :datum_writer, :buffer_writer, :buffer_encoder, :sync_marker, :meta, :codec
       attr_accessor :block_count
 
-      def initialize(writer, datum_writer, writers_schema=nil)
+      def initialize(writer, datum_writer, writers_schema=nil, codec=nil)
         # If writers_schema is not present, presume we're appending
         @writer = writer
         @encoder = IO::BinaryEncoder.new(@writer)
@@ -83,7 +105,8 @@ module Avro
 
         if writers_schema
           @sync_marker = Writer.generate_sync_marker
-          meta['avro.codec'] = 'null'
+          @codec = DataFile.get_codec(codec)
+          meta['avro.codec'] = @codec.codec_name.to_s
           meta['avro.schema'] = writers_schema.to_s
           datum_writer.writers_schema = writers_schema
           write_header
@@ -95,6 +118,7 @@ module Avro
           # collect metadata
           @sync_marker = dfr.sync_marker
           meta['avro.codec'] = dfr.meta['avro.codec']
+          @codec = DataFile.get_codec(meta['avro.codec'])
 
           # get schema used to write existing file
           schema_from_file = dfr.meta['avro.schema']
@@ -152,21 +176,15 @@ module Avro
 
       # TODO(jmhodges): make a schema for blocks and use datum_writer
       # TODO(jmhodges): do we really need the number of items in the block?
-      # TODO(jmhodges): use codec when writing the block contents
       def write_block
         if block_count > 0
           # write number of items in block and block size in bytes
           encoder.write_long(block_count)
-          to_write = buffer_writer.string
+          to_write = codec.compress(buffer_writer.string)
           encoder.write_long(to_write.size)
 
           # write block contents
-          if meta['avro.codec'] == 'null'
-            writer.write(to_write)
-          else
-            msg = "#{meta['avro.codec'].inspect} coded is not supported"
-            raise DataFileError, msg
-          end
+          writer.write(to_write)
 
           # write sync marker
           writer.write(sync_marker)
@@ -183,8 +201,14 @@ module Avro
     class Reader
       include ::Enumerable
 
-      attr_reader :reader, :decoder, :datum_reader, :sync_marker, :meta, :file_length
-      attr_accessor :block_count
+      # The reader and binary decoder for the raw file stream
+      attr_reader :reader, :decoder
+
+      # The binary decoder for the contents of a block (after codec decompression)
+      attr_reader :block_decoder
+
+      attr_reader :datum_reader, :sync_marker, :meta, :file_length, :codec
+      attr_accessor :block_count # records remaining in current block
 
       def initialize(reader, datum_reader)
         @reader = reader
@@ -194,11 +218,7 @@ module Avro
         # read the header: magic, meta, sync
         read_header
 
-        # ensure the codec is valid
-        codec_from_file = meta['avro.codec']
-        if codec_from_file && ! VALID_CODECS.include?(codec_from_file)
-          raise DataFileError, "Unknown codec: #{codec_from_file}"
-        end
+        @codec = DataFile.get_codec(meta['avro.codec'])
 
         # get ready to read
         @block_count = 0
@@ -220,7 +240,7 @@ module Avro
           end
         end
 
-        datum = datum_reader.read(decoder)
+        datum = datum_reader.read(block_decoder)
         self.block_count -= 1
         yield(datum)
       end
@@ -257,7 +277,9 @@ module Avro
 
       def read_block_header
         self.block_count = decoder.read_long
-        decoder.read_long # not doing anything with length in bytes
+        block_bytes = decoder.read_long
+        data = codec.decompress(reader.read(block_bytes))
+        @block_decoder = IO::BinaryDecoder.new(StringIO.new(data))
       end
 
       # read the length of the sync marker; if it matches the sync
@@ -273,5 +295,48 @@ module Avro
         end
       end
     end
+
+
+    class NullCodec
+      def codec_name; 'null'; end
+      def decompress(data); data; end
+      def compress(data); data; end
+    end
+
+    class DeflateCodec
+      attr_reader :level
+
+      def initialize(level=Zlib::DEFAULT_COMPRESSION)
+        @level = level
+      end
+
+      def codec_name; 'deflate'; end
+
+      def decompress(compressed)
+        # Passing a negative number to Inflate puts it into "raw" RFC1951 mode
+        # (without the RFC1950 header & checksum). See the docs for
+        # inflateInit2 in http://www.zlib.net/manual.html
+        zstream = Zlib::Inflate.new(-Zlib::MAX_WBITS)
+        data = zstream.inflate(compressed)
+        data << zstream.finish
+      ensure
+        zstream.close
+      end
+
+      def compress(data)
+        zstream = Zlib::Deflate.new(level, -Zlib::MAX_WBITS)
+        compressed = zstream.deflate(data)
+        compressed << zstream.finish
+      ensure
+        zstream.close
+      end
+    end
+
+    DataFile.register_codec NullCodec
+    DataFile.register_codec DeflateCodec
+
+    # TODO this constant won't be updated if you register another codec.
+    # Deprecated in favor of Avro::DataFile::codecs
+    VALID_CODECS = DataFile.codecs.keys
   end
 end
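Taken together, the data_file.rb changes above turn compression into a pluggable codec registry: register_codec stores an instance keyed by its codec_name, and get_codec accepts an instance, a class, or a registered name, defaulting to 'null'. A hedged sketch of plugging in a third-party codec; SnappyCodec is hypothetical and its compress/decompress bodies are pass-through placeholders for a real snappy binding:

  # Any object answering codec_name, compress, and decompress satisfies the duck type.
  class SnappyCodec
    def codec_name; 'snappy'; end
    def compress(data); data; end       # a real implementation would call a snappy library
    def decompress(data); data; end     # a real implementation would call a snappy library
  end

  Avro::DataFile.register_codec(SnappyCodec)

  # get_codec resolves a name, a class, or an instance:
  Avro::DataFile.get_codec('deflate')      # => the registered DeflateCodec instance
  Avro::DataFile.get_codec(SnappyCodec)    # => a fresh SnappyCodec instance
  Avro::DataFile.get_codec(nil)            # => the 'null' (identity) codec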
data/lib/avro/io.rb CHANGED
@@ -220,51 +220,43 @@ module Avro
     end
 
     class DatumReader
-      def self.check_props(schema_one, schema_two, prop_list)
-        prop_list.all? do |prop|
-          schema_one.send(prop) == schema_two.send(prop)
-        end
-      end
-
       def self.match_schemas(writers_schema, readers_schema)
-        w_type = writers_schema.type
-        r_type = readers_schema.type
+        w_type = writers_schema.type_sym
+        r_type = readers_schema.type_sym
 
         # This conditional is begging for some OO love.
-        if w_type == 'union' || r_type == 'union'
+        if w_type == :union || r_type == :union
           return true
         end
 
         if w_type == r_type
-          if Schema::PRIMITIVE_TYPES.include?(w_type) &&
-              Schema::PRIMITIVE_TYPES.include?(r_type)
-            return true
-          end
+          return true if Schema::PRIMITIVE_TYPES_SYM.include?(r_type)
 
           case r_type
-          when 'record'
-            return check_props(writers_schema, readers_schema, [:fullname])
-          when 'error'
-            return check_props(writers_schema, readers_schema, [:fullname])
-          when 'request'
+          when :record
+            return writers_schema.fullname == readers_schema.fullname
+          when :error
+            return writers_schema.fullname == readers_schema.fullname
+          when :request
            return true
-          when 'fixed'
-            return check_props(writers_schema, readers_schema, [:fullname, :size])
-          when 'enum'
-            return check_props(writers_schema, readers_schema, [:fullname])
-          when 'map'
-            return check_props(writers_schema.values, readers_schema.values, [:type])
-          when 'array'
-            return check_props(writers_schema.items, readers_schema.items, [:type])
+          when :fixed
+            return writers_schema.fullname == readers_schema.fullname &&
+                   writers_schema.size == readers_schema.size
+          when :enum
+            return writers_schema.fullname == readers_schema.fullname
+          when :map
+            return writers_schema.values.type == readers_schema.values.type
+          when :array
+            return writers_schema.items.type == readers_schema.items.type
           end
         end
 
         # Handle schema promotion
-        if w_type == 'int' && ['long', 'float', 'double'].include?(r_type)
+        if w_type == :int && [:long, :float, :double].include?(r_type)
           return true
-        elsif w_type == 'long' && ['float', 'double'].include?(r_type)
+        elsif w_type == :long && [:float, :double].include?(r_type)
           return true
-        elsif w_type == 'float' && r_type == 'double'
+        elsif w_type == :float && r_type == :double
           return true
         end
 
@@ -291,7 +283,7 @@ module Avro
 
         # schema resolution: reader's schema is a union, writer's
         # schema is not
-        if writers_schema.type != 'union' && readers_schema.type == 'union'
+        if writers_schema.type_sym != :union && readers_schema.type_sym == :union
          rs = readers_schema.schemas.find{|s|
            self.class.match_schemas(writers_schema, s)
          }
@@ -301,21 +293,21 @@ module Avro
 
         # function dispatch for reading data based on type of writer's
         # schema
-        case writers_schema.type
-        when 'null'; decoder.read_null
-        when 'boolean'; decoder.read_boolean
-        when 'string'; decoder.read_string
-        when 'int'; decoder.read_int
-        when 'long'; decoder.read_long
-        when 'float'; decoder.read_float
-        when 'double'; decoder.read_double
-        when 'bytes'; decoder.read_bytes
-        when 'fixed'; read_fixed(writers_schema, readers_schema, decoder)
-        when 'enum'; read_enum(writers_schema, readers_schema, decoder)
-        when 'array'; read_array(writers_schema, readers_schema, decoder)
-        when 'map'; read_map(writers_schema, readers_schema, decoder)
-        when 'union'; read_union(writers_schema, readers_schema, decoder)
-        when 'record', 'error', 'request'; read_record(writers_schema, readers_schema, decoder)
+        case writers_schema.type_sym
+        when :null; decoder.read_null
+        when :boolean; decoder.read_boolean
+        when :string; decoder.read_string
+        when :int; decoder.read_int
+        when :long; decoder.read_long
+        when :float; decoder.read_float
+        when :double; decoder.read_double
+        when :bytes; decoder.read_bytes
+        when :fixed; read_fixed(writers_schema, readers_schema, decoder)
+        when :enum; read_enum(writers_schema, readers_schema, decoder)
+        when :array; read_array(writers_schema, readers_schema, decoder)
+        when :map; read_map(writers_schema, readers_schema, decoder)
+        when :union; read_union(writers_schema, readers_schema, decoder)
+        when :record, :error, :request; read_record(writers_schema, readers_schema, decoder)
         else
           raise AvroError, "Cannot read unknown schema type: #{writers_schema.type}"
         end
@@ -416,34 +408,34 @@ module Avro
 
       def read_default_value(field_schema, default_value)
         # Basically a JSON Decoder?
-        case field_schema.type
-        when 'null'
+        case field_schema.type_sym
+        when :null
           return nil
-        when 'boolean'
+        when :boolean
           return default_value
-        when 'int', 'long'
+        when :int, :long
           return Integer(default_value)
-        when 'float', 'double'
+        when :float, :double
           return Float(default_value)
-        when 'enum', 'fixed', 'string', 'bytes'
+        when :enum, :fixed, :string, :bytes
           return default_value
-        when 'array'
+        when :array
           read_array = []
           default_value.each do |json_val|
            item_val = read_default_value(field_schema.items, json_val)
            read_array << item_val
          end
          return read_array
-        when 'map'
+        when :map
          read_map = {}
          default_value.each do |key, json_val|
            map_val = read_default_value(field_schema.values, json_val)
            read_map[key] = map_val
          end
          return read_map
-        when 'union'
+        when :union
          return read_default_value(field_schema.schemas[0], default_value)
-        when 'record', 'error'
+        when :record, :error
          read_record = {}
          field_schema.fields.each do |field|
            json_val = default_value[field.name]
@@ -459,37 +451,37 @@ module Avro
       end
 
       def skip_data(writers_schema, decoder)
-        case writers_schema.type
-        when 'null'
+        case writers_schema.type_sym
+        when :null
           decoder.skip_null
-        when 'boolean'
+        when :boolean
           decoder.skip_boolean
-        when 'string'
+        when :string
           decoder.skip_string
-        when 'int'
+        when :int
           decoder.skip_int
-        when 'long'
+        when :long
           decoder.skip_long
-        when 'float'
+        when :float
           decoder.skip_float
-        when 'double'
+        when :double
           decoder.skip_double
-        when 'bytes'
+        when :bytes
           decoder.skip_bytes
-        when 'fixed'
+        when :fixed
           skip_fixed(writers_schema, decoder)
-        when 'enum'
+        when :enum
           skip_enum(writers_schema, decoder)
-        when 'array'
+        when :array
           skip_array(writers_schema, decoder)
-        when 'map'
+        when :map
           skip_map(writers_schema, decoder)
-        when 'union'
+        when :union
           skip_union(writers_schema, decoder)
-        when 'record', 'error', 'request'
+        when :record, :error, :request
           skip_record(writers_schema, decoder)
         else
-          raise AvroError, "Unknown schema type: #{schm.type}"
+          raise AvroError, "Unknown schema type: #{writers_schema.type}"
         end
       end
 
@@ -552,21 +544,21 @@ module Avro
        end
 
        # function dispatch to write datum
-        case writers_schema.type
-        when 'null'; encoder.write_null(datum)
-        when 'boolean'; encoder.write_boolean(datum)
-        when 'string'; encoder.write_string(datum)
-        when 'int'; encoder.write_int(datum)
-        when 'long'; encoder.write_long(datum)
-        when 'float'; encoder.write_float(datum)
-        when 'double'; encoder.write_double(datum)
-        when 'bytes'; encoder.write_bytes(datum)
-        when 'fixed'; write_fixed(writers_schema, datum, encoder)
-        when 'enum'; write_enum(writers_schema, datum, encoder)
-        when 'array'; write_array(writers_schema, datum, encoder)
-        when 'map'; write_map(writers_schema, datum, encoder)
-        when 'union'; write_union(writers_schema, datum, encoder)
-        when 'record', 'error', 'request'; write_record(writers_schema, datum, encoder)
+        case writers_schema.type_sym
+        when :null; encoder.write_null(datum)
+        when :boolean; encoder.write_boolean(datum)
+        when :string; encoder.write_string(datum)
+        when :int; encoder.write_int(datum)
+        when :long; encoder.write_long(datum)
+        when :float; encoder.write_float(datum)
+        when :double; encoder.write_double(datum)
+        when :bytes; encoder.write_bytes(datum)
+        when :fixed; write_fixed(writers_schema, datum, encoder)
+        when :enum; write_enum(writers_schema, datum, encoder)
+        when :array; write_array(writers_schema, datum, encoder)
+        when :map; write_map(writers_schema, datum, encoder)
+        when :union; write_union(writers_schema, datum, encoder)
+        when :record, :error, :request; write_record(writers_schema, datum, encoder)
        else
          raise AvroError.new("Unknown type: #{writers_schema.type}")
        end
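The io.rb changes above replace string comparisons of schema types with symbol comparisons via type_sym and PRIMITIVE_TYPES_SYM, both provided by lib/avro/schema.rb, which is not shown in this diff. A rough sketch of what that side presumably looks like, offered only as an assumption for orientation:

  # Assumed shape of the schema.rb counterpart: cache the type as a Symbol so the
  # per-datum dispatch paths in io.rb compare symbols instead of strings.
  module Avro
    class Schema
      PRIMITIVE_TYPES_SYM = Set.new(PRIMITIVE_TYPES.map(&:to_sym))

      def type_sym
        @type_sym ||= type.to_sym
      end
    end
  end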