avro-jruby 1.7.5
Sign up to get free protection for your applications and to get access to all the features.
- data/CHANGELOG +1 -0
- data/Manifest +23 -0
- data/Rakefile +63 -0
- data/avro-jruby.gemspec +34 -0
- data/interop/test_interop.rb +41 -0
- data/lib/avro.rb +42 -0
- data/lib/avro/collect_hash.rb +25 -0
- data/lib/avro/data_file.rb +342 -0
- data/lib/avro/io.rb +615 -0
- data/lib/avro/ipc.rb +550 -0
- data/lib/avro/protocol.rb +161 -0
- data/lib/avro/schema.rb +405 -0
- data/test/random_data.rb +90 -0
- data/test/sample_ipc_client.rb +85 -0
- data/test/sample_ipc_http_client.rb +84 -0
- data/test/sample_ipc_http_server.rb +79 -0
- data/test/sample_ipc_server.rb +92 -0
- data/test/test_datafile.rb +188 -0
- data/test/test_help.rb +23 -0
- data/test/test_io.rb +393 -0
- data/test/test_protocol.rb +199 -0
- data/test/test_schema.rb +134 -0
- data/test/test_socket_transport.rb +40 -0
- data/test/tool.rb +144 -0
- metadata +103 -0
data/CHANGELOG
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
v0.0.1 stuff
|
data/Manifest
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
CHANGELOG
|
2
|
+
Manifest
|
3
|
+
Rakefile
|
4
|
+
avro-jruby.gemspec
|
5
|
+
interop/test_interop.rb
|
6
|
+
lib/avro.rb
|
7
|
+
lib/avro/collect_hash.rb
|
8
|
+
lib/avro/data_file.rb
|
9
|
+
lib/avro/io.rb
|
10
|
+
lib/avro/ipc.rb
|
11
|
+
lib/avro/protocol.rb
|
12
|
+
lib/avro/schema.rb
|
13
|
+
test/random_data.rb
|
14
|
+
test/sample_ipc_client.rb
|
15
|
+
test/sample_ipc_http_client.rb
|
16
|
+
test/sample_ipc_http_server.rb
|
17
|
+
test/sample_ipc_server.rb
|
18
|
+
test/test_datafile.rb
|
19
|
+
test/test_help.rb
|
20
|
+
test/test_io.rb
|
21
|
+
test/test_protocol.rb
|
22
|
+
test/test_socket_transport.rb
|
23
|
+
test/tool.rb
|
data/Rakefile
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
3
|
+
# distributed with this work for additional information
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
6
|
+
# "License"); you may not use this file except in compliance
|
7
|
+
# with the License. You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
# See the License for the specific language governing permissions and
|
15
|
+
# limitations under the License.
|
16
|
+
|
17
|
+
require 'rubygems'
require 'echoe'
require 'rake/testtask'

# Directory layout, relative to this Rakefile. Defined before the tasks that
# reference them so the file reads top-down.
HERE = File.expand_path(File.dirname(__FILE__))
SHARE = HERE + '/../../share'
SCHEMAS = SHARE + '/test/schemas'
BUILD = HERE + '/../../build'

# Gem version derived from the shared VERSION.txt; a "-SNAPSHOT" suffix is
# rewritten to ".pre1". File.read closes the handle, unlike File.open(...).read.
VERSION = File.read('../../share/VERSION.txt').sub('-SNAPSHOT', '.pre1').chomp

Echoe.new('avro', VERSION) do |p|
  p.author = "Apache Software Foundation"
  p.email = "avro-dev@hadoop.apache.org"
  p.summary = "Apache Avro for Ruby"
  p.description = "Avro is a data serialization and RPC format"
  p.url = "http://hadoop.apache.org/avro/"
  # The gem is published as "multi_json" (underscore); this matches the
  # gemspec dependency and the `require 'multi_json'` in lib/avro.rb.
  p.runtime_dependencies = %w[multi_json]
end

# `rake interop` runs the tests matching interop/test*.rb.
t = Rake::TestTask.new(:interop)
t.pattern = 'interop/test*.rb'

# Writes two data files under BUILD/interop/data: ruby.avro (two records, no
# codec argument) and ruby_deflate.avro (twenty records, :deflate codec).
# Records come from RandomData, constructed with ENV['SEED'].
task :generate_interop do
  $:.unshift(HERE + '/lib')
  $:.unshift(HERE + '/test')
  require 'avro'
  require 'random_data'

  schema = Avro::Schema.parse(File.read(SCHEMAS + '/interop.avsc'))
  r = RandomData.new(schema, ENV['SEED'])
  # Binary mode, consistent with Avro::DataFile.open which uses 'wb'.
  f = File.open(BUILD + '/interop/data/ruby.avro', 'wb')
  writer = Avro::DataFile::Writer.new(f, Avro::IO::DatumWriter.new(schema), schema)
  begin
    writer << r.next
    writer << r.next
  ensure
    # Writer#close flushes the pending block and closes f.
    writer.close
  end

  Avro::DataFile.open(BUILD + '/interop/data/ruby_deflate.avro', 'w', schema.to_s, :deflate) do |writer|
    20.times { writer << r.next }
  end
end

# Builds the gem and copies it into ../../dist/ruby.
task :dist => [:gem] do
  mkdir_p "../../dist/ruby"
  cp "pkg/avro-#{VERSION}.gem", "../../dist/ruby"
end
|
data/avro-jruby.gemspec
ADDED
@@ -0,0 +1,34 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
# stub: avro-jruby 1.7.5 ruby lib

# Gem specification for avro-jruby. The stub line above must carry the same
# name as s.name below; it previously said "avro".
Gem::Specification.new do |s|
  s.name = "avro-jruby"
  s.version = "1.7.5"

  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
  s.authors = ["Apache Software Foundation"]
  s.date = "2013-10-25"
  s.description = "Avro is a data serialization and RPC format"
  s.email = "avro-dev@hadoop.apache.org"
  s.extra_rdoc_files = ["CHANGELOG", "lib/avro.rb", "lib/avro/collect_hash.rb", "lib/avro/data_file.rb", "lib/avro/io.rb", "lib/avro/ipc.rb", "lib/avro/protocol.rb", "lib/avro/schema.rb"]
  s.files = ["CHANGELOG", "Manifest", "Rakefile", "avro-jruby.gemspec", "interop/test_interop.rb", "lib/avro.rb", "lib/avro/collect_hash.rb", "lib/avro/data_file.rb", "lib/avro/io.rb", "lib/avro/ipc.rb", "lib/avro/protocol.rb", "lib/avro/schema.rb", "test/random_data.rb", "test/sample_ipc_client.rb", "test/sample_ipc_http_client.rb", "test/sample_ipc_http_server.rb", "test/sample_ipc_server.rb", "test/test_datafile.rb", "test/test_help.rb", "test/test_io.rb", "test/test_protocol.rb", "test/test_socket_transport.rb", "test/tool.rb", "test/test_schema.rb"]
  s.homepage = "https://github.com/aia/avro-gem-jruby"
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Avro"]
  s.require_paths = ["lib"]
  # Guarded because newer RubyGems releases deprecate this attribute.
  s.rubyforge_project = "avro-jruby" if s.respond_to?(:rubyforge_project=)
  s.rubygems_version = "2.1.9"
  s.summary = "Apache Avro for Ruby"
  s.test_files = ["test/test_datafile.rb", "test/test_help.rb", "test/test_io.rb", "test/test_protocol.rb", "test/test_schema.rb", "test/test_socket_transport.rb"]

  s.specification_version = 4 if s.respond_to? :specification_version

  # Every branch of the generated version check added the same multi_json
  # dependency; only the method used differed. Prefer the runtime-dependency
  # API and fall back to add_dependency where it is missing.
  if s.respond_to?(:add_runtime_dependency)
    s.add_runtime_dependency(%q<multi_json>, [">= 0"])
  else
    s.add_dependency(%q<multi_json>, [">= 0"])
  end
end
|
@@ -0,0 +1,41 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
3
|
+
# or more contributor license agreements. See the NOTICE file
|
4
|
+
# distributed with this work for additional information
|
5
|
+
# regarding copyright ownership. The ASF licenses this file
|
6
|
+
# to you under the Apache License, Version 2.0 (the
|
7
|
+
# "License"); you may not use this file except in compliance
|
8
|
+
# with the License. You may obtain a copy of the License at
|
9
|
+
#
|
10
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
11
|
+
#
|
12
|
+
# Unless required by applicable law or agreed to in writing, software
|
13
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
14
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
15
|
+
# See the License for the specific language governing permissions and
|
16
|
+
# limitations under the License.
|
17
|
+
|
18
|
+
require 'rubygems'
|
19
|
+
require 'test/unit'
|
20
|
+
require 'avro'
|
21
|
+
|
22
|
+
# Defines one test per file found under build/interop/data. Each test opens
# the file with a DataFile::Reader and checks that at least one datum is read
# and that no datum is nil.
class TestInterop < Test::Unit::TestCase
  HERE = File.expand_path(File.dirname(__FILE__))
  SHARE = HERE + '/../../../share'
  SCHEMAS = SHARE + '/test/schemas'
  Dir[HERE + '/../../../build/interop/data/*'].each do |fn|
    # Strip the full ".avro" extension. The previous suffix 'avro' left the
    # dot in place, giving method names such as "test_read_ruby.".
    define_method("test_read_#{File.basename(fn, '.avro')}") do
      projection = Avro::Schema.parse(File.read(SCHEMAS+'/interop.avsc'))

      File.open(fn) do |f|
        r = Avro::DataFile::Reader.new(f, Avro::IO::DatumReader.new(projection))
        i = 0
        r.each do |datum|
          i += 1
          assert_not_nil datum, "nil datum from #{fn}"
        end
        assert_not_equal 0, i, "no data read in from #{fn}"
      end
    end
  end
end
|
data/lib/avro.rb
ADDED
@@ -0,0 +1,42 @@
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
3
|
+
# distributed with this work for additional information
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
6
|
+
# "License"); you may not use this file except in compliance
|
7
|
+
# with the License. You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
# See the License for the specific language governing permissions and
|
15
|
+
# limitations under the License.
|
16
|
+
|
17
|
+
require 'multi_json'
|
18
|
+
require 'set'
|
19
|
+
require 'digest/md5'
|
20
|
+
require 'net/http'
|
21
|
+
require 'stringio'
|
22
|
+
require 'zlib'
|
23
|
+
|
24
|
+
# Top-level namespace for the Avro library: version constant and the base
# error classes.
module Avro
  # Library version. Replaces the "FIXME" placeholder with the version
  # declared in the gemspec.
  VERSION = "1.7.5"

  # Base class for errors raised by this library.
  class AvroError < StandardError; end

  # Raised when a datum does not match a schema.
  class AvroTypeError < Avro::AvroError
    # schm::  the schema the datum was checked against
    # datum:: the offending value
    # msg::   optional message; when omitted one is built from schm and datum
    def initialize(schm=nil, datum=nil, msg=nil)
      msg ||= "Not a #{schm.to_s}: #{datum}"
      super(msg)
    end
  end
end
|
36
|
+
|
37
|
+
require 'avro/collect_hash'
|
38
|
+
require 'avro/schema'
|
39
|
+
require 'avro/io'
|
40
|
+
require 'avro/data_file'
|
41
|
+
require 'avro/protocol'
|
42
|
+
require 'avro/ipc'
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
3
|
+
# distributed with this work for additional information
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
6
|
+
# "License"); you may not use this file except in compliance
|
7
|
+
# with the License. You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
# See the License for the specific language governing permissions and
|
15
|
+
# limitations under the License.
|
16
|
+
|
17
|
+
module Enumerable
  # Builds a Hash from the [key, value] pairs the block returns for each
  # element. Elements for which the block yields a nil or false key are
  # skipped; later pairs overwrite earlier ones with the same key.
  def collect_hash
    each_with_object({}) do |element, result|
      key, value = yield(element)
      result[key] = value if key
    end
  end
end
|
@@ -0,0 +1,342 @@
|
|
1
|
+
# Licensed to the Apache Software Foundation (ASF) under one
|
2
|
+
# or more contributor license agreements. See the NOTICE file
|
3
|
+
# distributed with this work for additional information
|
4
|
+
# regarding copyright ownership. The ASF licenses this file
|
5
|
+
# to you under the Apache License, Version 2.0 (the
|
6
|
+
# "License"); you may not use this file except in compliance
|
7
|
+
# with the License. You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
# See the License for the specific language governing permissions and
|
15
|
+
# limitations under the License.
|
16
|
+
|
17
|
+
require 'openssl'
|
18
|
+
|
19
|
+
module Avro
|
20
|
+
module DataFile
|
21
|
+
# Container format version; packed as the final byte of MAGIC.
VERSION = 1
# Bytes expected at the very start of a data file: "Obj" plus the version byte.
MAGIC = "Obj" + [VERSION].pack('c')
MAGIC_SIZE = MAGIC.length
# Length in bytes of the sync marker written after the header and each block.
SYNC_SIZE = 16
# Buffered-byte threshold at which Writer#<< writes out the current block.
SYNC_INTERVAL = SYNC_SIZE * 1000
# Schema used to encode and decode the header metadata map.
META_SCHEMA = Schema.parse('{"type": "map", "values": "bytes"}')
VALID_ENCODINGS = %w[binary] # not used yet

# Raised for malformed data files and invalid DataFile usage.
class DataFileError < AvroError
end
|
30
|
+
|
31
|
+
# Opens an Avro data file for reading or writing.
#
# file_path:: path of the file to open
# mode::      'r' to read, 'w' to write; anything else raises DataFileError
# schema::    schema source passed to Avro::Schema.parse; required for 'w',
#             optional for 'r' (handed to the DatumReader)
# codec::     codec name, class or instance used when writing
#
# With a block, yields the reader/writer and closes it afterwards. Returns the
# reader/writer in either case.
#
# Raises DataFileError for an unsupported mode or a write without a schema.
def self.open(file_path, mode='r', schema=nil, codec=nil)
  schema = Avro::Schema.parse(schema) if schema
  case mode
  when 'w'
    unless schema
      raise DataFileError, "Writing an Avro file requires a schema."
    end
    file = File.open(file_path, 'wb')
    io = open_writer(file, schema, codec)
  when 'r'
    file = File.open(file_path, 'rb')
    io = open_reader(file, schema)
  else
    raise DataFileError, "Only modes 'r' and 'w' allowed. You gave #{mode.inspect}."
  end

  yield io if block_given?
  io
ensure
  if io
    io.close if block_given?
  elsif file && !file.closed?
    # Constructing the reader/writer failed (e.g. not an Avro file), so
    # nothing else owns the raw handle; close it rather than leak it.
    file.close
  end
end
|
50
|
+
|
51
|
+
# Returns the registry of codecs keyed by codec name (String). The registry
# is created on first access, so this never returns nil even before any
# codec has been registered.
def self.codecs
  @codecs ||= {}
end
|
54
|
+
|
55
|
+
# Adds a codec to the registry under its codec_name (converted to a String).
# A Class that does not itself respond to codec_name is instantiated first.
# Returns the registered codec.
def self.register_codec(codec)
  @codecs ||= {}
  needs_instance = codec.is_a?(Class) && !codec.respond_to?(:codec_name)
  codec = codec.new if needs_instance
  @codecs[codec.codec_name.to_s] = codec
end
|
60
|
+
|
61
|
+
# Resolves +codec+ to a codec object.
#
# codec:: nil (treated as 'null'), an object responding to compress and
#         decompress (returned as-is), a Class (instantiated), or a registered
#         codec name given as a String or Symbol
#
# Raises DataFileError when the name is not registered. This now also holds
# when no codec has been registered yet; previously that raised NoMethodError
# on the nil registry.
def self.get_codec(codec)
  codec ||= 'null'
  # it's a codec instance
  return codec if codec.respond_to?(:compress) && codec.respond_to?(:decompress)
  # it's a codec class
  return codec.new if codec.is_a?(Class)

  # it's a string or symbol (codec name)
  registry = (@codecs ||= {})
  registry.fetch(codec.to_s) do
    raise DataFileError, "Unknown codec: #{codec.inspect}"
  end
end
|
73
|
+
|
74
|
+
class << self
  private

  # Wraps +file+ in a DataFile::Writer whose DatumWriter is built from +schema+.
  def open_writer(file, schema, codec=nil)
    Avro::DataFile::Writer.new(file, Avro::IO::DatumWriter.new(schema), schema, codec)
  end

  # Wraps +file+ in a DataFile::Reader. +schema+ is passed as the second
  # argument of the DatumReader; the first is left nil.
  def open_reader(file, schema)
    Avro::DataFile::Reader.new(file, Avro::IO::DatumReader.new(nil, schema))
  end
end
|
86
|
+
|
87
|
+
# Writes data in blocks to an underlying IO-like object. Datums are buffered
# in memory and written out as a block (item count, byte length,
# codec-compressed payload, sync marker) once the buffer reaches
# SYNC_INTERVAL bytes or on sync/flush/close.
class Writer
  # Returns 16 random bytes used as the file's sync marker.
  def self.generate_sync_marker
    OpenSSL::Random.random_bytes(16)
  end

  attr_reader :writer, :encoder, :datum_writer, :buffer_writer, :buffer_encoder, :sync_marker, :meta, :codec
  attr_accessor :block_count

  # writer::         IO-like object the file is written to
  # datum_writer::   object that serializes datums (write, write_data,
  #                  writers_schema=)
  # writers_schema:: schema for a new file; when nil the writer appends to an
  #                  existing file and takes schema, codec and sync marker
  #                  from its header
  # codec::          codec name/class/instance; only used for new files
  def initialize(writer, datum_writer, writers_schema=nil, codec=nil)
    # If writers_schema is not present, presume we're appending
    @writer = writer
    @encoder = IO::BinaryEncoder.new(@writer)
    @datum_writer = datum_writer
    # The block buffer holds raw bytes; keep it binary so its length is a
    # byte count rather than a character count.
    @buffer_writer = StringIO.new(String.new, 'w')
    @buffer_writer.set_encoding('BINARY') if @buffer_writer.respond_to?(:set_encoding)
    @buffer_encoder = IO::BinaryEncoder.new(@buffer_writer)
    @block_count = 0

    @meta = {}

    if writers_schema
      @sync_marker = Writer.generate_sync_marker
      @codec = DataFile.get_codec(codec)
      meta['avro.codec'] = @codec.codec_name.to_s
      meta['avro.schema'] = writers_schema.to_s
      datum_writer.writers_schema = writers_schema
      write_header
    else
      # open writer for reading to collect metadata
      dfr = Reader.new(writer, Avro::IO::DatumReader.new)

      # FIXME(jmhodges): collect arbitrary metadata
      # collect metadata
      @sync_marker = dfr.sync_marker
      meta['avro.codec'] = dfr.meta['avro.codec']
      @codec = DataFile.get_codec(meta['avro.codec'])

      # get schema used to write existing file
      schema_from_file = dfr.meta['avro.schema']
      meta['avro.schema'] = schema_from_file
      datum_writer.writers_schema = Schema.parse(schema_from_file)

      # seek to the end of the file and prepare for writing
      writer.seek(0,2)
    end
  end

  # Append a datum to the file
  def <<(datum)
    datum_writer.write(datum, buffer_encoder)
    self.block_count += 1

    # if the data to write is larger than the sync interval, write
    # the block
    if buffer_writer.tell >= SYNC_INTERVAL
      write_block
    end
  end

  # Return the current position as a value that may be passed to
  # DataFileReader.seek(long). Forces the end of the current block,
  # emitting a synchronization marker.
  def sync
    write_block
    writer.tell
  end

  # Flush the current state of the file, including metadata
  def flush
    write_block
    writer.flush
  end

  # Flushes any buffered block and closes the underlying writer.
  def close
    flush
    writer.close
  end

  private

  # Writes the file header: magic bytes, metadata map, sync marker.
  def write_header
    # write magic
    writer.write(MAGIC)

    # write metadata
    datum_writer.write_data(META_SCHEMA, meta, encoder)

    # write sync marker
    writer.write(sync_marker)
  end

  # TODO(jmhodges): make a schema for blocks and use datum_writer
  # TODO(jmhodges): do we really need the number of items in the block?
  #
  # Writes the buffered datums as one block and resets the buffer. Does
  # nothing when no datum has been appended since the last block.
  def write_block
    return unless block_count > 0

    # write number of items in block and block size in bytes
    encoder.write_long(block_count)
    to_write = codec.compress(buffer_writer.string)
    # The block length is a byte count. String#size counts characters and
    # would under-report if the codec returns a multibyte-encoded string.
    encoder.write_long(to_write.bytesize)

    # write block contents
    writer.write(to_write)

    # write sync marker
    writer.write(sync_marker)

    # reset buffer
    buffer_writer.truncate(0)
    buffer_writer.rewind
    self.block_count = 0
  end
end
|
199
|
+
|
200
|
+
# Read files written by DataFileWriter
|
201
|
+
# Reads the header (magic, metadata, sync marker) on construction and then
# iterates over the datums stored in each block.
class Reader
  include ::Enumerable

  # The reader and binary decoder for the raw file stream
  attr_reader :reader, :decoder

  # The binary decoder for the contents of a block (after codec decompression)
  attr_reader :block_decoder

  attr_reader :datum_reader, :sync_marker, :meta, :file_length, :codec
  attr_accessor :block_count # records remaining in current block

  # reader::       seekable IO-like object positioned anywhere in the file
  # datum_reader:: object that deserializes datums (read, read_data,
  #                writers_schema=)
  #
  # Raises DataFileError when the input does not start with the Avro magic.
  def initialize(reader, datum_reader)
    @reader = reader
    @decoder = IO::BinaryDecoder.new(reader)
    @datum_reader = datum_reader

    # read the header: magic, meta, sync
    read_header

    @codec = DataFile.get_codec(meta['avro.codec'])

    # get ready to read
    @block_count = 0
    datum_reader.writers_schema = Schema.parse meta['avro.schema']
  end

  # Iterates through each datum in this file, yielding it to the block.
  # Without a block, returns an Enumerator over the datums.
  def each
    return enum_for(:each) unless block_given?

    loop do
      # Advance to the next block that still has records. Looping (rather
      # than reading a single header) skips blocks whose item count is zero.
      while block_count == 0
        return if eof?
        # A sync marker follows every block; once it is consumed the file
        # may be exhausted.
        return if skip_sync && eof?
        read_block_header
      end

      datum = datum_reader.read(block_decoder)
      self.block_count -= 1
      yield(datum)
    end
  end

  def eof?; reader.eof?; end

  # Closes the underlying reader.
  def close
    reader.close
  end

  private

  # Reads and validates the magic bytes, then loads the metadata map and the
  # sync marker.
  def read_header
    # seek to the beginning of the file to get magic block
    reader.seek(0, 0)

    # check magic number
    magic_in_file = reader.read(MAGIC_SIZE)
    # read returns nil at end of input (empty file), so check that first
    if magic_in_file.nil? || magic_in_file.size < MAGIC_SIZE
      msg = 'Not an Avro data file: shorter than the Avro magic block'
      raise DataFileError, msg
    elsif magic_in_file != MAGIC
      msg = "Not an Avro data file: #{magic_in_file.inspect} doesn't match #{MAGIC.inspect}"
      raise DataFileError, msg
    end

    # read metadata
    @meta = datum_reader.read_data(META_SCHEMA,
                                   META_SCHEMA,
                                   decoder)
    # read sync marker
    @sync_marker = reader.read(SYNC_SIZE)
  end

  # Reads a block's item count and byte length, then decompresses its payload
  # into block_decoder.
  def read_block_header
    self.block_count = decoder.read_long
    block_bytes = decoder.read_long
    data = codec.decompress(reader.read(block_bytes))
    @block_decoder = IO::BinaryDecoder.new(StringIO.new(data))
  end

  # read the length of the sync marker; if it matches the sync
  # marker, return true. Otherwise, seek back to where we started
  # and return false
  def skip_sync
    proposed_sync_marker = reader.read(SYNC_SIZE)
    return true if proposed_sync_marker == sync_marker

    # Rewind by what was actually consumed: near the end of the input the
    # read may return fewer than SYNC_SIZE bytes (or nil).
    consumed = proposed_sync_marker ? proposed_sync_marker.bytesize : 0
    reader.seek(-consumed, 1) if consumed > 0
    false
  end
end
|
298
|
+
|
299
|
+
|
300
|
+
# Codec registered under the name 'null'. Both directions return the data
# they were given, unchanged.
class NullCodec
  def codec_name
    'null'
  end

  def compress(data)
    data
  end

  def decompress(data)
    data
  end
end
|
305
|
+
|
306
|
+
# Codec registered under the name 'deflate'. Uses zlib in raw RFC1951 mode
# (no RFC1950 header or checksum).
class DeflateCodec
  attr_reader :level

  # level:: zlib compression level; defaults to Zlib::DEFAULT_COMPRESSION
  def initialize(level=Zlib::DEFAULT_COMPRESSION)
    @level = level
  end

  def codec_name; 'deflate'; end

  # Inflates a raw-deflate payload and returns the original bytes.
  def decompress(compressed)
    # Passing a negative number to Inflate puts it into "raw" RFC1951 mode
    # (without the RFC1950 header & checksum). See the docs for
    # inflateInit2 in http://www.zlib.net/manual.html
    zstream = Zlib::Inflate.new(-Zlib::MAX_WBITS)
    data = zstream.inflate(compressed)
    data << zstream.finish
  ensure
    # zstream is nil if the constructor raised; calling close on it would
    # replace the real error with a NoMethodError.
    zstream.close if zstream
  end

  # Deflates +data+ at the configured level and returns the raw payload.
  def compress(data)
    zstream = Zlib::Deflate.new(level, -Zlib::MAX_WBITS)
    compressed = zstream.deflate(data)
    compressed << zstream.finish
  ensure
    # See decompress: keep the original Zlib error visible.
    zstream.close if zstream
  end
end
|
334
|
+
|
335
|
+
# Register the codecs defined in this file.
[NullCodec, DeflateCodec].each { |codec_class| DataFile.register_codec(codec_class) }

# TODO this constant won't be updated if you register another codec.
# Deprecated in favor of Avro::DataFile::codecs
VALID_CODECS = DataFile.codecs.keys
|
341
|
+
end
|
342
|
+
end
|