avro-jruby 1.7.5
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- data/CHANGELOG +1 -0
- data/Manifest +23 -0
- data/Rakefile +63 -0
- data/avro-jruby.gemspec +34 -0
- data/interop/test_interop.rb +41 -0
- data/lib/avro.rb +42 -0
- data/lib/avro/collect_hash.rb +25 -0
- data/lib/avro/data_file.rb +342 -0
- data/lib/avro/io.rb +615 -0
- data/lib/avro/ipc.rb +550 -0
- data/lib/avro/protocol.rb +161 -0
- data/lib/avro/schema.rb +405 -0
- data/test/random_data.rb +90 -0
- data/test/sample_ipc_client.rb +85 -0
- data/test/sample_ipc_http_client.rb +84 -0
- data/test/sample_ipc_http_server.rb +79 -0
- data/test/sample_ipc_server.rb +92 -0
- data/test/test_datafile.rb +188 -0
- data/test/test_help.rb +23 -0
- data/test/test_io.rb +393 -0
- data/test/test_protocol.rb +199 -0
- data/test/test_schema.rb +134 -0
- data/test/test_socket_transport.rb +40 -0
- data/test/tool.rb +144 -0
- metadata +103 -0
data/CHANGELOG
ADDED
@@ -0,0 +1 @@
+v0.0.1 stuff
data/Manifest
ADDED
@@ -0,0 +1,23 @@
+CHANGELOG
+Manifest
+Rakefile
+avro.gemspec
+interop/test_interop.rb
+lib/avro.rb
+lib/avro/collect_hash.rb
+lib/avro/data_file.rb
+lib/avro/io.rb
+lib/avro/ipc.rb
+lib/avro/protocol.rb
+lib/avro/schema.rb
+test/random_data.rb
+test/sample_ipc_client.rb
+test/sample_ipc_http_client.rb
+test/sample_ipc_http_server.rb
+test/sample_ipc_server.rb
+test/test_datafile.rb
+test/test_help.rb
+test/test_io.rb
+test/test_protocol.rb
+test/test_socket_transport.rb
+test/tool.rb
data/Rakefile
ADDED
@@ -0,0 +1,63 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+require 'rubygems'
+require 'echoe'
+VERSION = File.open('../../share/VERSION.txt').read.sub('-SNAPSHOT', '.pre1').chomp
+Echoe.new('avro', VERSION) do |p|
+  p.author = "Apache Software Foundation"
+  p.email = "avro-dev@hadoop.apache.org"
+  p.summary = "Apache Avro for Ruby"
+  p.description = "Avro is a data serialization and RPC format"
+  p.url = "http://hadoop.apache.org/avro/"
+  p.runtime_dependencies = %w[multi-json]
+end
+
+t = Rake::TestTask.new(:interop)
+t.pattern = 'interop/test*.rb'
+
+task :generate_interop do
+  $:.unshift(HERE + '/lib')
+  $:.unshift(HERE + '/test')
+  require 'avro'
+  require 'random_data'
+
+  schema = Avro::Schema.parse(File.read(SCHEMAS + '/interop.avsc'))
+  r = RandomData.new(schema, ENV['SEED'])
+  f = File.open(BUILD + '/interop/data/ruby.avro', 'w')
+  writer = Avro::DataFile::Writer.new(f, Avro::IO::DatumWriter.new(schema), schema)
+  begin
+    writer << r.next
+    writer << r.next
+  ensure
+    writer.close
+  end
+
+  Avro::DataFile.open(BUILD + '/interop/data/ruby_deflate.avro', 'w', schema.to_s, :deflate) do |writer|
+    20.times { writer << r.next }
+  end
+end
+
+
+HERE = File.expand_path(File.dirname(__FILE__))
+SHARE = HERE + '/../../share'
+SCHEMAS = SHARE + '/test/schemas'
+BUILD = HERE + '/../../build'
+
+task :dist => [:gem] do
+  mkdir_p "../../dist/ruby"
+  cp "pkg/avro-#{VERSION}.gem", "../../dist/ruby"
+end
data/avro-jruby.gemspec
ADDED
@@ -0,0 +1,34 @@
+# -*- encoding: utf-8 -*-
+# stub: avro 1.7.5 ruby lib
+
+Gem::Specification.new do |s|
+  s.name = "avro-jruby"
+  s.version = "1.7.5"
+
+  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
+  s.authors = ["Apache Software Foundation"]
+  s.date = "2013-10-25"
+  s.description = "Avro is a data serialization and RPC format"
+  s.email = "avro-dev@hadoop.apache.org"
+  s.extra_rdoc_files = ["CHANGELOG", "lib/avro.rb", "lib/avro/collect_hash.rb", "lib/avro/data_file.rb", "lib/avro/io.rb", "lib/avro/ipc.rb", "lib/avro/protocol.rb", "lib/avro/schema.rb"]
+  s.files = ["CHANGELOG", "Manifest", "Rakefile", "avro-jruby.gemspec", "interop/test_interop.rb", "lib/avro.rb", "lib/avro/collect_hash.rb", "lib/avro/data_file.rb", "lib/avro/io.rb", "lib/avro/ipc.rb", "lib/avro/protocol.rb", "lib/avro/schema.rb", "test/random_data.rb", "test/sample_ipc_client.rb", "test/sample_ipc_http_client.rb", "test/sample_ipc_http_server.rb", "test/sample_ipc_server.rb", "test/test_datafile.rb", "test/test_help.rb", "test/test_io.rb", "test/test_protocol.rb", "test/test_socket_transport.rb", "test/tool.rb", "test/test_schema.rb"]
+  s.homepage = "https://github.com/aia/avro-gem-jruby"
+  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Avro"]
+  s.require_paths = ["lib"]
+  s.rubyforge_project = "avro-jruby"
+  s.rubygems_version = "2.1.9"
+  s.summary = "Apache Avro for Ruby"
+  s.test_files = ["test/test_datafile.rb", "test/test_help.rb", "test/test_io.rb", "test/test_protocol.rb", "test/test_schema.rb", "test/test_socket_transport.rb"]
+
+  if s.respond_to? :specification_version then
+    s.specification_version = 4
+
+    if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
+      s.add_runtime_dependency(%q<multi_json>, [">= 0"])
+    else
+      s.add_dependency(%q<multi_json>, [">= 0"])
+    end
+  else
+    s.add_dependency(%q<multi_json>, [">= 0"])
+  end
+end
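For reference, a minimal way to depend on this release from a Bundler project is sketched below. The gem name, version, and multi_json runtime dependency come from the gemspec above; the Gemfile itself and the rubygems.org source line are illustrative assumptions, not part of the package.

# Gemfile (illustrative sketch)
source 'https://rubygems.org'

gem 'avro-jruby', '1.7.5'  # resolves the multi_json runtime dependency declared above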
data/interop/test_interop.rb
ADDED
@@ -0,0 +1,41 @@
+#!/usr/bin/env ruby
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+require 'rubygems'
+require 'test/unit'
+require 'avro'
+
+class TestInterop < Test::Unit::TestCase
+  HERE = File.expand_path(File.dirname(__FILE__))
+  SHARE = HERE + '/../../../share'
+  SCHEMAS = SHARE + '/test/schemas'
+  Dir[HERE + '/../../../build/interop/data/*'].each do |fn|
+    define_method("test_read_#{File.basename(fn, 'avro')}") do
+      projection = Avro::Schema.parse(File.read(SCHEMAS+'/interop.avsc'))
+
+      File.open(fn) do |f|
+        r = Avro::DataFile::Reader.new(f, Avro::IO::DatumReader.new(projection))
+        i = 0
+        r.each do |datum|
+          i += 1
+          assert_not_nil datum, "nil datum from #{fn}"
+        end
+        assert_not_equal 0, i, "no data read in from #{fn}"
+      end
+    end
+  end
+end
data/lib/avro.rb
ADDED
@@ -0,0 +1,42 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+require 'multi_json'
+require 'set'
+require 'digest/md5'
+require 'net/http'
+require 'stringio'
+require 'zlib'
+
+module Avro
+  VERSION = "FIXME"
+
+  class AvroError < StandardError; end
+
+  class AvroTypeError < Avro::AvroError
+    def initialize(schm=nil, datum=nil, msg=nil)
+      msg ||= "Not a #{schm.to_s}: #{datum}"
+      super(msg)
+    end
+  end
+end
+
+require 'avro/collect_hash'
+require 'avro/schema'
+require 'avro/io'
+require 'avro/data_file'
+require 'avro/protocol'
+require 'avro/ipc'
data/lib/avro/collect_hash.rb
ADDED
@@ -0,0 +1,25 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+module Enumerable
+  def collect_hash
+    inject(Hash.new) do |memo, i|
+      k, v = yield(i)
+      memo[k] = v if k
+      memo
+    end
+  end
+end
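As a quick illustration of the Enumerable#collect_hash helper above (a sketch, assuming the gem is loaded): the block receives each element and returns a [key, value] pair, and pairs whose key is nil or false are dropped.

require 'avro'

# Illustrative only: build a hash of type name => name length.
lengths = %w[null int string].collect_hash { |name| [name, name.length] }
# lengths == {"null" => 4, "int" => 3, "string" => 6}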
data/lib/avro/data_file.rb
ADDED
@@ -0,0 +1,342 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+require 'openssl'
+
+module Avro
+  module DataFile
+    VERSION = 1
+    MAGIC = "Obj" + [VERSION].pack('c')
+    MAGIC_SIZE = MAGIC.size
+    SYNC_SIZE = 16
+    SYNC_INTERVAL = 1000 * SYNC_SIZE
+    META_SCHEMA = Schema.parse('{"type": "map", "values": "bytes"}')
+    VALID_ENCODINGS = ['binary'] # not used yet
+
+    class DataFileError < AvroError; end
+
+    def self.open(file_path, mode='r', schema=nil, codec=nil)
+      schema = Avro::Schema.parse(schema) if schema
+      case mode
+      when 'w'
+        unless schema
+          raise DataFileError, "Writing an Avro file requires a schema."
+        end
+        io = open_writer(File.open(file_path, 'wb'), schema, codec)
+      when 'r'
+        io = open_reader(File.open(file_path, 'rb'), schema)
+      else
+        raise DataFileError, "Only modes 'r' and 'w' allowed. You gave #{mode.inspect}."
+      end
+
+      yield io if block_given?
+      io
+    ensure
+      io.close if block_given? && io
+    end
+
+    def self.codecs
+      @codecs
+    end
+
+    def self.register_codec(codec)
+      @codecs ||= {}
+      codec = codec.new if !codec.respond_to?(:codec_name) && codec.is_a?(Class)
+      @codecs[codec.codec_name.to_s] = codec
+    end
+
+    def self.get_codec(codec)
+      codec ||= 'null'
+      if codec.respond_to?(:compress) && codec.respond_to?(:decompress)
+        codec # it's a codec instance
+      elsif codec.is_a?(Class)
+        codec.new # it's a codec class
+      elsif @codecs.include?(codec.to_s)
+        @codecs[codec.to_s] # it's a string or symbol (codec name)
+      else
+        raise DataFileError, "Unknown codec: #{codec.inspect}"
+      end
+    end
+
+    class << self
+      private
+      def open_writer(file, schema, codec=nil)
+        writer = Avro::IO::DatumWriter.new(schema)
+        Avro::DataFile::Writer.new(file, writer, schema, codec)
+      end
+
+      def open_reader(file, schema)
+        reader = Avro::IO::DatumReader.new(nil, schema)
+        Avro::DataFile::Reader.new(file, reader)
+      end
+    end
+
+    class Writer
+      def self.generate_sync_marker
+        OpenSSL::Random.random_bytes(16)
+      end
+
+      attr_reader :writer, :encoder, :datum_writer, :buffer_writer, :buffer_encoder, :sync_marker, :meta, :codec
+      attr_accessor :block_count
+
+      def initialize(writer, datum_writer, writers_schema=nil, codec=nil)
+        # If writers_schema is not present, presume we're appending
+        @writer = writer
+        @encoder = IO::BinaryEncoder.new(@writer)
+        @datum_writer = datum_writer
+        @buffer_writer = StringIO.new('', 'w')
+        @buffer_encoder = IO::BinaryEncoder.new(@buffer_writer)
+        @block_count = 0
+
+        @meta = {}
+
+        if writers_schema
+          @sync_marker = Writer.generate_sync_marker
+          @codec = DataFile.get_codec(codec)
+          meta['avro.codec'] = @codec.codec_name.to_s
+          meta['avro.schema'] = writers_schema.to_s
+          datum_writer.writers_schema = writers_schema
+          write_header
+        else
+          # open writer for reading to collect metadata
+          dfr = Reader.new(writer, Avro::IO::DatumReader.new)
+
+          # FIXME(jmhodges): collect arbitrary metadata
+          # collect metadata
+          @sync_marker = dfr.sync_marker
+          meta['avro.codec'] = dfr.meta['avro.codec']
+          @codec = DataFile.get_codec(meta['avro.codec'])
+
+          # get schema used to write existing file
+          schema_from_file = dfr.meta['avro.schema']
+          meta['avro.schema'] = schema_from_file
+          datum_writer.writers_schema = Schema.parse(schema_from_file)
+
+          # seek to the end of the file and prepare for writing
+          writer.seek(0,2)
+        end
+      end
+
+      # Append a datum to the file
+      def <<(datum)
+        datum_writer.write(datum, buffer_encoder)
+        self.block_count += 1
+
+        # if the data to write is larger than the sync interval, write
+        # the block
+        if buffer_writer.tell >= SYNC_INTERVAL
+          write_block
+        end
+      end
+
+      # Return the current position as a value that may be passed to
+      # DataFileReader.seek(long). Forces the end of the current block,
+      # emitting a synchronization marker.
+      def sync
+        write_block
+        writer.tell
+      end
+
+      # Flush the current state of the file, including metadata
+      def flush
+        write_block
+        writer.flush
+      end
+
+      def close
+        flush
+        writer.close
+      end
+
+      private
+
+      def write_header
+        # write magic
+        writer.write(MAGIC)
+
+        # write metadata
+        datum_writer.write_data(META_SCHEMA, meta, encoder)
+
+        # write sync marker
+        writer.write(sync_marker)
+      end
+
+      # TODO(jmhodges): make a schema for blocks and use datum_writer
+      # TODO(jmhodges): do we really need the number of items in the block?
+      def write_block
+        if block_count > 0
+          # write number of items in block and block size in bytes
+          encoder.write_long(block_count)
+          to_write = codec.compress(buffer_writer.string)
+          encoder.write_long(to_write.size)
+
+          # write block contents
+          writer.write(to_write)
+
+          # write sync marker
+          writer.write(sync_marker)
+
+          # reset buffer
+          buffer_writer.truncate(0)
+          buffer_writer.rewind
+          self.block_count = 0
+        end
+      end
+    end
+
+    # Read files written by DataFileWriter
+    class Reader
+      include ::Enumerable
+
+      # The reader and binary decoder for the raw file stream
+      attr_reader :reader, :decoder
+
+      # The binary decoder for the contents of a block (after codec decompression)
+      attr_reader :block_decoder
+
+      attr_reader :datum_reader, :sync_marker, :meta, :file_length, :codec
+      attr_accessor :block_count # records remaining in current block
+
+      def initialize(reader, datum_reader)
+        @reader = reader
+        @decoder = IO::BinaryDecoder.new(reader)
+        @datum_reader = datum_reader
+
+        # read the header: magic, meta, sync
+        read_header
+
+        @codec = DataFile.get_codec(meta['avro.codec'])
+
+        # get ready to read
+        @block_count = 0
+        datum_reader.writers_schema = Schema.parse meta['avro.schema']
+      end
+
+      # Iterates through each datum in this file
+      # TODO(jmhodges): handle block of length zero
+      def each
+        loop do
+          if block_count == 0
+            case
+            when eof?; break
+            when skip_sync
+              break if eof?
+              read_block_header
+            else
+              read_block_header
+            end
+          end
+
+          datum = datum_reader.read(block_decoder)
+          self.block_count -= 1
+          yield(datum)
+        end
+      end
+
+      def eof?; reader.eof?; end
+
+      def close
+        reader.close
+      end
+
+      private
+      def read_header
+        # seek to the beginning of the file to get magic block
+        reader.seek(0, 0)
+
+        # check magic number
+        magic_in_file = reader.read(MAGIC_SIZE)
+        if magic_in_file.size < MAGIC_SIZE
+          msg = 'Not an Avro data file: shorter than the Avro magic block'
+          raise DataFileError, msg
+        elsif magic_in_file != MAGIC
+          msg = "Not an Avro data file: #{magic_in_file.inspect} doesn't match #{MAGIC.inspect}"
+          raise DataFileError, msg
+        end
+
+        # read metadata
+        @meta = datum_reader.read_data(META_SCHEMA,
+                                       META_SCHEMA,
+                                       decoder)
+        # read sync marker
+        @sync_marker = reader.read(SYNC_SIZE)
+      end
+
+      def read_block_header
+        self.block_count = decoder.read_long
+        block_bytes = decoder.read_long
+        data = codec.decompress(reader.read(block_bytes))
+        @block_decoder = IO::BinaryDecoder.new(StringIO.new(data))
+      end
+
+      # read the length of the sync marker; if it matches the sync
+      # marker, return true. Otherwise, seek back to where we started
+      # and return false
+      def skip_sync
+        proposed_sync_marker = reader.read(SYNC_SIZE)
+        if proposed_sync_marker != sync_marker
+          reader.seek(-SYNC_SIZE, 1)
+          false
+        else
+          true
+        end
+      end
+    end
+
+
+    class NullCodec
+      def codec_name; 'null'; end
+      def decompress(data); data; end
+      def compress(data); data; end
+    end
+
+    class DeflateCodec
+      attr_reader :level
+
+      def initialize(level=Zlib::DEFAULT_COMPRESSION)
+        @level = level
+      end
+
+      def codec_name; 'deflate'; end
+
+      def decompress(compressed)
+        # Passing a negative number to Inflate puts it into "raw" RFC1951 mode
+        # (without the RFC1950 header & checksum). See the docs for
+        # inflateInit2 in http://www.zlib.net/manual.html
+        zstream = Zlib::Inflate.new(-Zlib::MAX_WBITS)
+        data = zstream.inflate(compressed)
+        data << zstream.finish
+      ensure
+        zstream.close
+      end
+
+      def compress(data)
+        zstream = Zlib::Deflate.new(level, -Zlib::MAX_WBITS)
+        compressed = zstream.deflate(data)
+        compressed << zstream.finish
+      ensure
+        zstream.close
+      end
+    end
+
+    DataFile.register_codec NullCodec
+    DataFile.register_codec DeflateCodec
+
+    # TODO this constant won't be updated if you register another codec.
+    # Deprecated in favor of Avro::DataFile::codecs
+    VALID_CODECS = DataFile.codecs.keys
+  end
+end
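To show how the Writer and Reader defined above are typically driven, here is a minimal round-trip sketch based on the DataFile.open signature in this file; the record schema and the users.avro file name are made-up examples, not part of the package.

require 'avro'

# Illustrative schema; in write mode DataFile.open parses the JSON string itself.
schema_json = '{"type": "record", "name": "User", "fields": [{"name": "name", "type": "string"}]}'

# Write two records using the deflate codec registered above.
Avro::DataFile.open('users.avro', 'w', schema_json, :deflate) do |writer|
  writer << { 'name' => 'alice' }
  writer << { 'name' => 'bob' }
end

# Read them back; the reader takes the writer's schema from the file header.
Avro::DataFile.open('users.avro') do |reader|
  reader.each { |datum| puts datum['name'] }
end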