avro 1.3.0

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their public registries.
@@ -0,0 +1 @@
+ v0.0.1 stuff
@@ -0,0 +1,19 @@
+ CHANGELOG
+ Manifest
+ Rakefile
+ avro.gemspec
+ interop/test_interop.rb
+ lib/avro.rb
+ lib/avro/collect_hash.rb
+ lib/avro/data_file.rb
+ lib/avro/io.rb
+ lib/avro/ipc.rb
+ lib/avro/protocol.rb
+ lib/avro/schema.rb
+ test/random_data.rb
+ test/sample_ipc_client.rb
+ test/sample_ipc_server.rb
+ test/test_help.rb
+ test/test_io.rb
+ test/test_protocol.rb
+ tmp/test.rb.avro
@@ -0,0 +1,59 @@
+ # Licensed to the Apache Software Foundation (ASF) under one
+ # or more contributor license agreements. See the NOTICE file
+ # distributed with this work for additional information
+ # regarding copyright ownership. The ASF licenses this file
+ # to you under the Apache License, Version 2.0 (the
+ # "License"); you may not use this file except in compliance
+ # with the License. You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ require 'rubygems'
+ require 'echoe'
+ VERSION = File.open('../../share/VERSION.txt').read
+ Echoe.new('avro', VERSION) do |p|
+ p.author = "Apache Software Foundation"
+ p.email = "avro-dev@hadoop.apache.org"
+ p.summary = "Apache Avro for Ruby"
+ p.description = "Apache is a data serialization and RPC format"
+ p.url = "http://hadoop.apache.org/avro/"
+ p.runtime_dependencies = %w[yajl-ruby]
+ end
+
+ t = Rake::TestTask.new(:interop)
+ t.pattern = 'interop/test*.rb'
+
+ task :generate_interop do
+ $:.unshift(HERE + '/lib')
+ $:.unshift(HERE + '/test')
+ require 'avro'
+ require 'random_data'
+
+ schema = Avro::Schema.parse(File.read(SCHEMAS + '/interop.avsc'))
+ r = RandomData.new(schema, ENV['SEED'])
+ f = File.open(BUILD + '/interop/data/ruby.avro', 'w')
+ writer = Avro::DataFile::Writer.new(f, Avro::IO::DatumWriter.new(schema), schema)
+ begin
+ writer << r.next
+ writer << r.next
+ ensure
+ writer.close
+ end
+ end
+
+
+ HERE = File.expand_path(File.dirname(__FILE__))
+ SHARE = HERE + '/../../share'
+ SCHEMAS = SHARE + '/test/schemas'
+ BUILD = HERE + '/../../build'
+
+ task :dist => [:manifest, :gem] do
+ mkdir_p "../../dist/ruby"
+ cp "pkg/avro-#{VERSION}.gem", "../../dist/ruby"
+ end
@@ -0,0 +1,34 @@
+ # -*- encoding: utf-8 -*-
+
+ Gem::Specification.new do |s|
+ s.name = %q{avro}
+ s.version = "1.3.0"
+
+ s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
+ s.authors = ["Apache Software Foundation"]
+ s.date = %q{2010-03-01}
+ s.description = %q{Apache is a data serialization and RPC format}
+ s.email = %q{avro-dev@hadoop.apache.org}
+ s.extra_rdoc_files = ["CHANGELOG", "lib/avro.rb", "lib/avro/collect_hash.rb", "lib/avro/data_file.rb", "lib/avro/io.rb", "lib/avro/ipc.rb", "lib/avro/protocol.rb", "lib/avro/schema.rb"]
+ s.files = ["CHANGELOG", "Manifest", "Rakefile", "avro.gemspec", "interop/test_interop.rb", "lib/avro.rb", "lib/avro/collect_hash.rb", "lib/avro/data_file.rb", "lib/avro/io.rb", "lib/avro/ipc.rb", "lib/avro/protocol.rb", "lib/avro/schema.rb", "test/random_data.rb", "test/sample_ipc_client.rb", "test/sample_ipc_server.rb", "test/test_help.rb", "test/test_io.rb", "test/test_protocol.rb", "tmp/test.rb.avro"]
+ s.homepage = %q{http://hadoop.apache.org/avro/}
+ s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Avro"]
+ s.require_paths = ["lib"]
+ s.rubyforge_project = %q{avro}
+ s.rubygems_version = %q{1.3.5}
+ s.summary = %q{Apache Avro for Ruby}
+ s.test_files = ["test/test_help.rb", "test/test_io.rb", "test/test_protocol.rb"]
+
+ if s.respond_to? :specification_version then
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
+ s.specification_version = 3
+
+ if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
+ s.add_runtime_dependency(%q<yajl-ruby>, [">= 0"])
+ else
+ s.add_dependency(%q<yajl-ruby>, [">= 0"])
+ end
+ else
+ s.add_dependency(%q<yajl-ruby>, [">= 0"])
+ end
+ end
@@ -0,0 +1,41 @@
+ #!/usr/bin/env ruby
+ # Licensed to the Apache Software Foundation (ASF) under one
+ # or more contributor license agreements. See the NOTICE file
+ # distributed with this work for additional information
+ # regarding copyright ownership. The ASF licenses this file
+ # to you under the Apache License, Version 2.0 (the
+ # "License"); you may not use this file except in compliance
+ # with the License. You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ require 'rubygems'
+ require 'test/unit'
+ require 'avro'
+
+ class TestInterop < Test::Unit::TestCase
+ HERE = File.expand_path(File.dirname(__FILE__))
+ SHARE = HERE + '/../../../share'
+ SCHEMAS = SHARE + '/test/schemas'
+ Dir[HERE + '/../../../build/interop/data/*'].each do |fn|
+ define_method("test_read_#{File.basename(fn, 'avro')}") do
+ projection = Avro::Schema.parse(File.read(SCHEMAS+'/interop.avsc'))
+
+ File.open(fn) do |f|
+ r = Avro::DataFile::Reader.new(f, Avro::IO::DatumReader.new(projection))
+ i = 0
+ r.each do |datum|
+ i += 1
+ assert_not_nil datum, "nil datum from #{fn}"
+ end
+ assert_not_equal 0, i, "no data read in from #{fn}"
+ end
+ end
+ end
+ end
@@ -0,0 +1,39 @@
+ # Licensed to the Apache Software Foundation (ASF) under one
+ # or more contributor license agreements. See the NOTICE file
+ # distributed with this work for additional information
+ # regarding copyright ownership. The ASF licenses this file
+ # to you under the Apache License, Version 2.0 (the
+ # "License"); you may not use this file except in compliance
+ # with the License. You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ require 'yajl'
+ require 'set'
+ require 'md5'
+
+ module Avro
+ VERSION = "FIXME"
+
+ class AvroError < StandardError; end
+
+ class AvroTypeError < Avro::AvroError
+ def initialize(schm=nil, datum=nil, msg=nil)
+ msg ||= "Not a #{schm.to_s}: #{datum}"
+ super(msg)
+ end
+ end
+ end
+
+ require 'avro/collect_hash'
+ require 'avro/schema'
+ require 'avro/io'
+ require 'avro/data_file'
+ require 'avro/protocol'
+ require 'avro/ipc'
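
For orientation, the file above (lib/avro.rb per the Manifest) is the gem's entry point: it loads yajl, defines the error hierarchy, and requires the remaining avro/* files. A minimal sketch of loading it, assuming the gem is installed; the schema string is illustrative only:

    require 'rubygems'
    require 'avro'   # pulls in avro/schema, avro/io, avro/data_file, etc.

    # Schema.parse is provided by lib/avro/schema.rb.
    schema = Avro::Schema.parse('{"type": "string"}')

    # Type mismatches surface through the hierarchy defined above:
    # AvroTypeError < AvroError < StandardError.
    begin
      raise Avro::AvroTypeError.new(schema, 42)
    rescue Avro::AvroError => e
      puts e.message
    end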
@@ -0,0 +1,25 @@
+ # Licensed to the Apache Software Foundation (ASF) under one
+ # or more contributor license agreements. See the NOTICE file
+ # distributed with this work for additional information
+ # regarding copyright ownership. The ASF licenses this file
+ # to you under the Apache License, Version 2.0 (the
+ # "License"); you may not use this file except in compliance
+ # with the License. You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ module Enumerable
+ def collect_hash
+ inject(Hash.new) do |memo, i|
+ k, v = yield(i)
+ memo[k] = v if k
+ memo
+ end
+ end
+ end
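
As a quick illustration of the Enumerable#collect_hash helper above (lib/avro/collect_hash.rb): the block returns a [key, value] pair for each element, and pairs whose key is nil are dropped. The sample data below is made up for the example:

    require 'avro'   # lib/avro.rb requires avro/collect_hash first

    pairs = [['name', 'string'], [nil, 'skipped'], ['age', 'int']]
    types = pairs.collect_hash { |field, type| [field, type] }
    # => {"name" => "string", "age" => "int"}   (the nil key is dropped)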
@@ -0,0 +1,243 @@
+ # Licensed to the Apache Software Foundation (ASF) under one
+ # or more contributor license agreements. See the NOTICE file
+ # distributed with this work for additional information
+ # regarding copyright ownership. The ASF licenses this file
+ # to you under the Apache License, Version 2.0 (the
+ # "License"); you may not use this file except in compliance
+ # with the License. You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ require 'openssl'
+
+ module Avro
+ module DataFile
+ VERSION = 1
+ MAGIC = "Obj" + [VERSION].pack('c')
+ MAGIC_SIZE = MAGIC.size
+ SYNC_SIZE = 16
+ SYNC_INTERVAL = 1000 * SYNC_SIZE
+ META_SCHEMA = Schema.parse('{"type": "map", "values": "bytes"}')
+ VALID_CODECS = ['null']
+ VALID_ENCODINGS = ['binary'] # not used yet
+
+ class DataFileError < AvroError; end
+
+ class Writer
+ def self.generate_sync_marker
+ OpenSSL::Random.random_bytes(16)
+ end
+
+ attr_reader :writer, :encoder, :datum_writer, :buffer_writer, :buffer_encoder, :sync_marker, :meta
+ attr_accessor :block_count
+
+ def initialize(writer, datum_writer, writers_schema=nil)
+ # If writers_schema is not present, presume we're appending
+ @writer = writer
+ @encoder = IO::BinaryEncoder.new(@writer)
+ @datum_writer = datum_writer
+ @buffer_writer = StringIO.new('', 'w')
+ @buffer_encoder = IO::BinaryEncoder.new(@buffer_writer)
+ @block_count = 0
+
+ @meta = {}
+
+ if writers_schema
+ @sync_marker = Writer.generate_sync_marker
+ meta['avro.codec'] = 'null'
+ meta['avro.schema'] = writers_schema.to_s
+ datum_writer.writers_schema = writers_schema
+ write_header
+ else
+ # open writer for reading to collect metadata
+ dfr = Reader.new(writer, Avro::IO::DatumReader.new)
+
+ # FIXME(jmhodges): collect arbitrary metadata
+ # collect metadata
+ @sync_marker = dfr.sync_marker
+ meta['avro.codec'] = dfr.meta['avro.codec']
+
+ # get schema used to write existing file
+ schema_from_file = dfr.meta['avro.schema']
+ meta['avro.schema'] = schema_from_file
+ datum_writer.writers_schema = Schema.parse(schema_from_file)
+
+ # seek to the end of the file and prepare for writing
+ writer.seek(0,2)
+ end
+ end
+
+ # Append a datum to the file
+ def <<(datum)
+ datum_writer.write(datum, buffer_encoder)
+ self.block_count += 1
+
+ # if the data to write is larger than the sync interval, write
+ # the block
+ if buffer_writer.tell >= SYNC_INTERVAL
+ write_block
+ end
+ end
+
+ # Return the current position as a value that may be passed to
+ # DataFileReader.seek(long). Forces the end of the current block,
+ # emitting a synchronization marker.
+ def sync
+ write_block
+ writer.tell
+ end
+
+ # Flush the current state of the file, including metadata
+ def flush
+ write_block
+ writer.flush
+ end
+
+ def close
+ flush
+ writer.close
+ end
+
+ private
+
+ def write_header
+ # write magic
+ writer.write(MAGIC)
+
+ # write metadata
+ datum_writer.write_data(META_SCHEMA, meta, encoder)
+
+ # write sync marker
+ writer.write(sync_marker)
+ end
+
+ # TODO(jmhodges): make a schema for blocks and use datum_writer
+ # TODO(jmhodges): do we really need the number of items in the block?
+ # TODO(jmhodges): use codec when writing the block contents
+ def write_block
+ if block_count > 0
+ # write number of items in block and block size in bytes
+ encoder.write_long(block_count)
+ to_write = buffer_writer.string
+ encoder.write_long(to_write.size)
+
+ # write block contents
+ if meta['avro.codec'] == 'null'
+ writer.write(to_write)
+ else
+ msg = "#{meta['avro.codec'].inspect} coded is not supported"
+ raise DataFileError, msg
+ end
+
+ # write sync marker
+ writer.write(sync_marker)
+
+ # reset buffer
+ buffer_writer.truncate(0)
+ self.block_count = 0
+ end
+ end
+ end
+
+ # Read files written by DataFileWriter
+ class Reader
+ include ::Enumerable
+
+ attr_reader :reader, :decoder, :datum_reader, :sync_marker, :meta, :file_length
+ attr_accessor :block_count
+
+ def initialize(reader, datum_reader)
+ @reader = reader
+ @decoder = IO::BinaryDecoder.new(reader)
+ @datum_reader = datum_reader
+
+ # read the header: magic, meta, sync
+ read_header
+
+ # ensure the codec is valid
+ codec_from_file = meta['avro.codec']
+ if codec_from_file && ! VALID_CODECS.include?(codec_from_file)
+ raise DataFileError, "Unknown codec: #{codec_from_file}"
+ end
+
+ # get ready to read
+ @block_count = 0
+ datum_reader.writers_schema = Schema.parse meta['avro.schema']
+ end
+
+ # Iterates through each datum in this file
+ # TODO(jmhodges): handle block of length zero
+ def each
+ loop do
+ if block_count == 0
+ case
+ when eof?; break
+ when skip_sync
+ break if eof?
+ read_block_header
+ else
+ read_block_header
+ end
+ end
+
+ datum = datum_reader.read(decoder)
+ self.block_count -= 1
+ yield(datum)
+ end
+ end
+
+ def eof?; reader.eof?; end
+
+ def close
+ reader.close
+ end
+
+ private
+ def read_header
+ # seek to the beginning of the file to get magic block
+ reader.seek(0, 0)
+
+ # check magic number
+ magic_in_file = reader.read(MAGIC_SIZE)
+ if magic_in_file.size < MAGIC_SIZE
+ msg = 'Not an Avro data file: shorter than the Avro magic block'
+ raise DataFileError, msg
+ elsif magic_in_file != MAGIC
+ msg = "Not an Avro data file: #{magic_in_file.inspect} doesn't match #{MAGIC.inspect}"
+ raise DataFileError, msg
+ end
+
+ # read metadata
+ @meta = datum_reader.read_data(META_SCHEMA,
+ META_SCHEMA,
+ decoder)
+ # read sync marker
+ @sync_marker = reader.read(SYNC_SIZE)
+ end
+
+ def read_block_header
+ self.block_count = decoder.read_long
+ decoder.read_long # not doing anything with length in bytes
+ end
+
+ # read the length of the sync marker; if it matches the sync
+ # marker, return true. Otherwise, seek back to where we started
+ # and return false
+ def skip_sync
+ proposed_sync_marker = reader.read(SYNC_SIZE)
+ if proposed_sync_marker != sync_marker
+ reader.seek(-SYNC_SIZE, 1)
+ false
+ else
+ true
+ end
+ end
+ end
+ end
+ end
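
To put Writer and Reader in context, here is a minimal write/read round trip against lib/avro/data_file.rb, mirroring the usage in the Rakefile's generate_interop task and in interop/test_interop.rb. The file name and record schema are illustrative only, and error handling is omitted:

    require 'rubygems'
    require 'avro'

    SCHEMA = Avro::Schema.parse('{"type": "record", "name": "User",
      "fields": [{"name": "name", "type": "string"},
                 {"name": "age",  "type": "int"}]}')

    # Passing a schema tells Writer to start a new file (header, metadata,
    # sync marker); omitting it would take the append path shown above.
    file = File.open('users.avro', 'w')
    writer = Avro::DataFile::Writer.new(file, Avro::IO::DatumWriter.new(SCHEMA), SCHEMA)
    writer << {'name' => 'alice', 'age' => 30}
    writer << {'name' => 'bob',   'age' => 25}
    writer.close   # flushes the buffered block and closes the file

    # Reader recovers the writer's schema from the file metadata.
    reader = Avro::DataFile::Reader.new(File.open('users.avro'), Avro::IO::DatumReader.new(SCHEMA))
    reader.each { |user| puts user.inspect }
    reader.close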